組合語言（王爽）-實驗16

阿新 • • 發佈：2021-08-10

爬蟲

介紹

# 爬蟲流程
	模擬傳送http請求 ---> 解析資料(清洗資料) ---> 入庫
    
# 百度、谷歌...(大爬蟲)
	百度搜索：輸入關鍵字 ---> 搜的是百度的資料庫 ---> 頁面展示 ---> 點選具體內容 ---> 網頁跳轉
    
	seo優化：主動讓百度爬到你
    
    sem：花錢做廣告買關鍵詞
    
# 爬蟲協議
	哪部分允許爬取，哪部分不允許爬取(https://www.csdn.net/robots.txt)
    
# python中爬蟲相關內容
	模擬傳送http請求(requests，selenium) ---> 解析資料(清洗資料)(json、bs4...) --->入庫
    (檔案、mysql、redis、Excel、MongoDB)
    反爬：
    	封ip  			  --- 代理池
        封賬號 			 --- cookie池
        請求頭中帶特殊校驗	 --- 相應破解出那些欄位
        資料加密		    --- js解析出加密方式，自行組裝資料
        html			   ---  css反爬，字型反爬

requests庫介紹

# requests模組，基於urllib3封裝，方便的發出http請求

# pip install requests

requests傳送get請求

普通請求

import requests

# Plain GET request: fetch a page and print the decoded response body.
res = requests.get('https://www.cnblogs.com/xiaoyuanqujing/articles/11805698.html')

print(res.text)  # response body as text

search = input('請輸入要搜尋的內容：')
# Pass the keyword through `params` instead of concatenating it into the URL,
# so requests URL-encodes it (characters such as '&', '#' or '+' typed by the
# user would otherwise corrupt the query string).
res = requests.get('https://www.baidu.com/s',
                   params={'wd': search},
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
                       'Host': 'www.baidu.com',
                   })

print(res.text)
# Save the result page so it can be opened in a browser.
with open('search.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

攜帶引數

import requests

# Query Sogou: the search term travels in `params`, which requests encodes
# into the query string; a browser-like User-Agent is sent along.
sogou_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
}
sogou_query = {'query': '美女'}
response = requests.get('https://www.sogou.com/web', headers=sogou_headers, params=sogou_query)
print(response.text)

# Keep a copy of the result page on disk.
with open('search.html', 'w', encoding='utf-8') as f:
    f.write(response.text)

# URL percent-encoding and decoding with the standard library.
from urllib.parse import quote, unquote, urlencode

# Encoding a single value: quote('美女') gives '%E7%BE%8E%E5%A5%B3'.

# Decoding turns the percent-escapes back into text.
res = unquote('%E7%BE%8E%E5%A5%B3')
print(res)

# urlencode builds a whole query string from a mapping, encoding each value.
query_fields = {'wd': '美女', 'age': 19}
res = urlencode(query_fields, encoding='utf-8')
print(res)

攜帶請求頭

# 如果被做了反爬，但是用瀏覽器可以，一定是模擬的不像


import os

# Headers copied from a real browser session. The Cookie value is read from
# the BAIDU_COOKIE environment variable instead of being hard-coded: the
# original literal was broken across two lines (a SyntaxError) and embedded a
# personal login session. Copy your own Cookie header from the browser's
# developer tools and export it before running.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
    'Cookie': os.environ.get('BAIDU_COOKIE', ''),
}
res = requests.get('https://www.baidu.com/s?wd=帥哥', headers=header)
print(res.text)

# Save the result page so it can be inspected in a browser.
with open('search.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

攜帶cookie

res = requests.get('http://www.aa7a.cn/', headers={
    # 'cookie': 'ECS_ID=b435f5897f41c2f2c322fa3065165c9fbc56ddd5; ECS[visit_times]=1; _jzqa=1.4423902263705487400.1628049030.1628049030.1628049030.1; _jzqc=1; _jzqy=1.1628049030.1628049030.1.jzqsr=baidu.-; _jzqckmp=1; UM_distinctid=17b0f48bcda509-0e53827f49b667-5e422810-1fa400-17b0f48bcdb54c; CNZZDATA4603183=cnzz_eid%3D1414483188-1628045968-null%26ntime%3D1628045968; Hm_lvt_c29657ca36c6c88e02fed9a397826038=1628049030; CNZZDATA1260462072=271803043-1628045968-null%7C1628045968; Qs_lvt_201322=1628049030; mediav=%7B%22eid%22%3A%22179539%22%2C%22ep%22%3A%22%22%2C%22vid%22%3A%22%22%2C%22ctn%22%3A%22%22%2C%22vvid%22%3A%22%22%2C%22_mvnf%22%3A1%2C%22_mvctn%22%3A0%2C%22_mvck%22%3A1%2C%22_refnf%22%3A0%7D; _qzjc=1; __xsptplusUT_422=1; __xsptplus422=422.1.1628049032.1628049032.1%234%7C%7C%7C%7C%7C%23%23HXwWiieCoDk4evQa5H5dKIEBtnxLTY12%23; ECS[username]=616564099%40qq.com; ECS[user_id]=61399; ECS[password]=4a5e6ce9d1aba9de9b31abdf303bbdc2; _qzja=1.1591066928.1628049030327.1628049030327.1628049030327.1628049032756.1628049055229.616564099%2540qq_com.1.0.3.1; _qzjb=1.1628049030327.3.0.0.0; _qzjto=3.1.0; _jzqb=1.8.10.1628049030.1; Qs_pv_201322=2246468261428716000%2C1231523507243942000; Hm_lpvt_c29657ca36c6c88e02fed9a397826038=1628049055; cto_bundle=Gd60IF9TJTJGRHFuTzdidXZYZGEyVW9ydFFJV25YY0RqSlBRODRlTDdjSG9RT01NUlg4NmYyVjhPMzNmenolMkJDMlRiQjJWTHA2UlBoUUdNOGtBTnoyTkZqdmJMOEI5Vk14aVU4JTJGbHdyTXFqaCUyRlY1dWt3JTNE'
})
# True only when the page shows the logged-in user name. The literal is the
# account from the cookie's ECS[username] field; the scraped text had replaced
# it with an e-mail-obfuscation placeholder that could never match.
print('616564099@qq.com' in res.text)

requests傳送post請求

自動登入某網站

# Log in by POSTing the same form fields the site's login page submits.
# The user name is restored from the ECS[username] cookie field shown in the
# cookie example; the scraped text had replaced it with an obfuscation
# placeholder.
res = requests.post('http://www.aa7a.cn/user.php', data={
    'username': '616564099@qq.com',
    'password': 'lqz123',
    'captcha': 'zxv7',
    'remember': 1,
    'ref': 'http://www.aa7a.cn/',
    'act': 'act_login'
})

# print(res.text)
# Cookies set by the login response (a CookieJar object).
cookie = res.cookies
print(cookie)

# Send those cookies with the next request; without them (see the commented
# variant) the page is served as for an anonymous visitor.
res2 = requests.get('http://www.aa7a.cn/', cookies=cookie)
# res2=requests.get('http://www.aa7a.cn/')
print('616564099@qq.com' in res2.text)

body體中攜帶資料

### 6 body體攜帶資料
# res = requests.post('',data={})  # urlencoded方式
# res = requests.post('',json={'key':'value'})  # application/json方式（json引數傳字典/列表，requests會自動序列化）
# res = requests.post('',json='',headers={
# 'content-type': 'application/json;charset=utf-8'
# })

response屬性、編碼問題，獲取二進位制，解析json

## 7  response屬性，


# respone=requests.get('http://www.aa7a.cn/')
#
#
# print(respone.text)   # 響應體的字串
# print('----------------------------------')
# print(respone.content) # 響應體的二進位制（圖片，視訊，頁面）
# print('----------------------------------')
# print(respone.status_code) # 響應的狀態碼
# print(respone.headers)     # 響應頭
# print(respone.cookies)     # 返回的cookie
# print(respone.cookies.get_dict())  # cookieJar物件轉成字典
# print(respone.cookies.items())    # 相當於字典的items
#
# print(respone.url)               # 當次請求地址
# print(respone.history)           # 重定向過才有值
#
# print(respone.encoding)          # 響應的編碼格式

#關閉：response.close()
# from contextlib import closing
# with closing(requests.get('xxx',stream=True)) as response:
#     for line in response.iter_content():
#         pass


# 8 編碼問題，
# 可能會遇到列印respone.text出現亂碼，在瀏覽器頁面中看不會出現亂碼
# respone=requests.get('http://www.aa7a.cn/')
# # respone.encoding='gbk'  # 修改編碼方式
# respone.encoding=respone.apparent_encoding   # 頁面使用的編碼方式
# print(respone.text)   # 響應體的字串



# 9 獲取二進位制，
res=requests.get('http://www.aa7a.cn/data/afficheimg/20201102gophex.png')
# print(res.content)
# with open('致命誘惑.png','wb') as f:
#     f.write(res.content)


# with open('致命誘惑.png','wb') as f:
#     for line in res.iter_content(1024):
#         f.write(line)



# 10 解析json
# import json
# res=requests.get('https://api.luffycity.com/api/v1/course/category/actual/?courseType=actual')
# # print(json.loads(res.text))
#
# print(res.json())

爬取視訊

#  爬取視訊
#https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=5&start=0

# import re
# res=requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=5&start=0')
#
# # print(res.text)
# # 如果使用bs4，非常簡單
#
# video_list=re.findall('<a href="(.*?)" class="vervideo-lilink actplay">',res.text)
# # print(video_list)
# for video in video_list:
#     video_url='https://www.pearvideo.com/'+video
#     # print(video_url)
#     video_id=video.split('_')[-1]
#
#     header={
#         'Referer':video_url
#     }
#
#     res2=requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.5165499193941832'%video_id,headers=header)
#
#     video_f_url=res2.json()['videoInfo']['videos']['srcUrl']
#     video_real_url=video_f_url.replace(video_f_url.rsplit('/')[-1].split('-')[0], 'cont-%s' % video_id)
#     print(video_real_url)
#
#     res3=requests.get(video_real_url)
#     with open('%s.mp4'%video_id,'wb') as f:
#         for line in res3.iter_content(1024):
#             f.write(line)









# 分析過程稿
# referer:上一次訪問的地址，可以做圖片防盜鏈
# header={
#     'Referer': 'https://www.pearvideo.com/video_1737590'
# }
#
# res=requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=1737590&mrd=0.5165499193941832',headers=header)
# print(res.text)



## 可以播放的視訊
# 'https://video.pearvideo.com/mp4/short/20210729/cont-1736870-15732687-hd.mp4'
# ## 不可以播放的視訊
# 'https://video.pearvideo.com/mp4/short/20210729/1628062847275-15732687-hd.mp4'
#
#
# 'https://video.pearvideo.com/mp4/short/20210729/   cont-1736870   -15732687-hd.mp4'
# 'https://video.pearvideo.com/mp4/short/20210729/   1628062847275   -15732687-hd.mp4'
#
# s='https://video.pearvideo.com/mp4/short/20210729/  1628062847275 -15732687-hd.mp4'
# s.replace(s.rsplit('/')[-1].split('-')[0],'cont-%s'%video_id)

補充

# 長鏈轉短鏈服務
	核心：重定向

requests高階用法

補充

1 正向代理和反向代理
	正向代理：代理客戶端
    反向代理：代理服務端(nginx就是反向代理伺服器)
    
2 requests使用的代理：正向代理

SSL Cert Verification（瞭解）

## 不驗證證書
# import requests
# respone=requests.get('https://www.12306.cn',verify=False) #不驗證證書,報警告,返回200
# print(respone.status_code)
#
#
# ## 攜帶證書
# import requests
# respone=requests.get('https://www.12306.cn',
#                      cert=('/path/server.crt',
#                            '/path/key'))
# print(respone.status_code)

使用代理

# import requests
# proxies = {
#     'http':'http://117.69.230.132:3256',
# }
# respone=requests.get('https://www.12306.cn',
#                      proxies=proxies)
#
# print(respone.status_code)

# import requests
# proxies = {
#     'http':'http://117.69.230.132:3256',
# }
# # respone=requests.get('http://127.0.0.1:8000',proxies=proxies)
# respone=requests.get('http://127.0.0.1:8000')
#
# print(respone.text)


# import requests
# proxies = {
#     'http':'http://103.228.245.98:3128',
# }
# respone=requests.get('http://101.133.225.166:8888/',proxies=proxies)
# # respone=requests.get('http://101.133.225.166:8888/')
#
# print(respone.text)

## 如果你有很多代理，每次發請求，隨機取一個代理ip，傳送，這樣我們的ip就不會被封
### 花錢買
### 白嫖
# import requests
# res=requests.get('http://demo.spiderpy.cn/get/').json()['proxy']
# print(res)
#
# proxies = {
#     # 鍵要和請求url的協議一致(下面請求的是http地址)，否則代理不會生效
#     'http':'http://%s'%res,
# }
# print()
#
# respone=requests.get('http://www.baidu.com',proxies=proxies)
# # respone=requests.get('http://www.baidu.com')

# print(respone.text)



### 藉助於第三方，自己搭建(讀一讀人家原始碼)
#https://github.com/jhao104/proxy_pool

超時時間

# respone=requests.get('https://www.baidu.com',timeout=0.0001)

認證（像老款路由器的登入）

# import requests
# from requests.auth import HTTPBasicAuth
# r=requests.get('xxx',auth=HTTPBasicAuth('user','password'))
# print(r.status_code)

異常處理

# import requests
# from requests.exceptions import * #可以檢視requests.exceptions獲取異常型別
#
# try:
#     r=requests.get('http://www.baidu.com',timeout=0.00001)
# # except ReadTimeout:
# #     print('===:')
# # except ConnectionError: #網路不通
# #     print('-----')
# # except Timeout:
# #     print('aaaaa')
#
# except Exception:
#     print('Error')

檔案上傳

# import requests
# files={'myfile':open('1 自動處理cookie.py','rb')}
# respone=requests.post('http://127.0.0.1:8000/upload_file/',files=files)
# print(respone.text)

抽屜自動點贊

import requests

# Headers sent with every request. 'Cookie' must be filled with the cookie of
# a logged-in session for the votes to be accepted.
header = {
    'Cookie': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
}
# Voting for a single, known link:
# res=requests.post('https://dig.chouti.com/link/vote',data={'linkId':'31857081'},headers=header)
# print(res.text)


# Vote for everything on the 24h hot list. This endpoint returns JSON, so the
# link ids can be read directly; an HTML page would need bs4 to extract them.
hot_list = requests.get('https://dig.chouti.com/top/24hr?_=1628136305346', headers=header).json()
for item in hot_list['data']:
    # Named link_id so the builtin id() is not shadowed.
    link_id = item['id']
    res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': '%s' % link_id}, headers=header)
    print(res.text)

爬取汽車之家新聞

# pip3 install beautifulsoup4
import requests
from bs4 import BeautifulSoup

# Output layout for one news item (whitespace kept exactly as in the original
# triple-quoted template).
_NEWS_TEMPLATE = (
    '\n'
    '                新聞標題：%s\n'
    '                新聞摘要：%s\n'
    '                新聞圖片：%s\n'
    '                新聞地址：%s          \n'
    '                '
)


def _with_scheme(link):
    """Return *link* unchanged if it already starts with 'http', else prefix 'https:'."""
    return link if link.startswith('http') else 'https:' + link


for i in range(1, 100):
    res = requests.get('https://www.autohome.com.cn/news/%s/#liststart' % i)
    # First argument: the markup to parse; second: the parser to use.
    # 'html.parser' ships with Python; 'lxml' is an alternative that needs
    # `pip3 install lxml`.
    soup = BeautifulSoup(res.text, 'html.parser')

    # find_all returns every match: here each <ul class="article"> list.
    ul_list = soup.find_all(name='ul', class_='article')

    for ul in ul_list:
        for li in ul.find_all(name='li'):
            h3 = li.find(name='h3')
            p = li.find(name='p')
            img = li.find(name='img')
            a = li.find(name='a')
            # Skip entries that lack any of the expected tags instead of
            # crashing the whole crawl on a None lookup.
            if h3 is None or p is None or img is None or a is None:
                continue
            img_src = img.get('src')
            href = a.get('href')
            if not img_src or not href:
                continue

            title = h3.text  # text content of the tag
            desc = p.text
            # Add the scheme only when the link does not already have one;
            # applied to both links so they are treated consistently.
            img_url = _with_scheme(img_src)
            url = _with_scheme(href)
            print(url)

            print(_NEWS_TEMPLATE % (title, desc, img_url, url))

bs4遍歷文件樹


from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story <span>lqz</span></b><span>egon</span></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup=BeautifulSoup(html_doc,'html.parser')


# res=soup.prettify()  # 美化
# print(res)

#1、用法
# html=soup.html
# title=soup.html.head.title
# title=soup.title
# print(title)



#2、獲取標籤的名稱 ---> 標籤物件.name
# a=soup.body.a
# a=soup.a.name
# print(a)
# print(soup.body.name)


#3、獲取標籤的屬性  ---->標籤物件['屬性名']
# href=soup.body.a['href']
# attrs=soup.body.a.attrs  # 所有屬性，---》字典
# href=soup.body.a.attrs['href']
# print(attrs['class'])

# c=soup.p.attrs['class']
# print(c)

#4、獲取標籤的內容

# res=soup.b.text  # 拿到當前標籤子子孫所有的text
# res=soup.p.text

# res=soup.p.string # 當前標籤有且只有一個文字內容才能拿出來
# res=soup.b.string # 當前標籤有且只有一個文字內容才能拿出來

# res=soup.p.strings   # 把子子孫放到生成器中
#
# print(list(res))



#5、巢狀選擇
# res=soup.html.body.p
# print(type(res))  # bs4.element.Tag
from bs4.element import Tag


####瞭解
#6、子節點、子孫節點
# print(soup.p.contents) #p下所有子節點，放到列表中

# print(soup.p.children) #得到一個迭代器,包含p下所有子節點

# for i,child in enumerate(soup.p.children):
#     print(i,child)

# print(soup.p.descendants) #獲取子孫節點,p下所有的標籤都會選擇出來
# for i,child in enumerate(soup.p.descendants):
#     print(i,child)


#7、父節點、祖先節點

# print(soup.a.parent) #獲取a標籤的父節點

# print(soup.body.parent)

# print(soup.a.parents) #找到a標籤所有的祖先節點，父親的父親，父親的父親的父親...
# print(list(soup.a.parents))
# print(len(list(soup.a.parents)))


#8、兄弟節點
# print(soup.a.next_sibling) #下一個兄弟
# print(soup.a.previous_sibling) #上一個兄弟
#
# print(list(soup.a.next_siblings)) #下面的兄弟們=>生成器物件
# print(list(soup.a.previous_siblings)) #上面的兄弟們=>生成器物件

bs4搜尋文件樹

from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body id='body'>
<p class="title"><b>The Dormouse's story <span>lqz</span></b><span>egon</span></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup=BeautifulSoup(html_doc,'html.parser')


# 搜尋文件樹  find  find_all

# 五種過濾器: 字串、正則表示式、列表、True、方法


##### 字串
# res=soup.find(name='body')
# res=soup.find(name='p',class_='story')

# 查詢id為link2的標籤
# res=soup.find(id='link2',name='a',class_='sister',href='http://example.com/lacie')
# res=soup.find(href='http://example.com/lacie')
# print(res)

# res=soup.find(attrs={'class':['sister']})
# print(res)


#### 正則表示式
import re
# res=soup.find_all(name=re.compile('^b')) #找出b開頭的標籤，結果有body和b標籤
# res=soup.find(name=re.compile('^b'))


# res=soup.find_all(class_=re.compile('^s'))
# res=soup.find_all(href=re.compile('^http'))
# res=soup.find_all(id=re.compile('^l'))
# print(res)


####列表、

# res=soup.find_all(name=['body','b'])
# res=soup.find_all(id=['link1','link2'])

# res=soup.find_all(attrs={'id':['link1','link2']})
#
# print(res)

# True、

# links=soup.find_all(href=True)
# print(links)

# res=soup.find_all(name=True)
# res=soup.find_all(id=True)
# print(res)



#方法
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# print(len(soup.find_all(name=has_class_but_no_id)))


# Collect every image on the page: <img> tags carry their URL in `src`
# (not `href`), so filter on that attribute being present.
soup.find_all(name='img',src=True)



## 建議 遍歷文件樹和搜尋文件樹混用
# soup.body.div.find




### 其他引數  find，find_all

#limit
# soup.find()
# res=soup.find_all(name='a',href=True,limit=2)  # 限制獲取的條數
# print(res)


# recursive 是否遞迴查詢
# res=soup.find_all(name='a',recursive=False)
# res=soup.find_all(name='html',recursive=False)
# print(res)

css選擇器(與xpath是通用的)

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
    <b>The Dormouse's story  <p>asdfasdf</p></b>
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
        <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    <div class='panel-1'>
        <ul class='list' id='list-1'>
            <li class='element'>Foo</li>
            <li class='element'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
        <ul class='list list-small' id='list-2'>
            <li class='element'><h1 class='yyyy'>Foo</h1></li>
            <li class='element xxx'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
    </div>
    and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html_doc,'html.parser')


'''
#id
.類名
標籤
標籤>標籤
標籤 標籤
'''

# res=soup.p.select('.sister')  # 使用css選擇器
# res=soup.p.select('#link1')  # 使用css選擇器
# res=soup.select('body>p')  # 使用css選擇器 body的子標籤p
res=soup.select('body p')  # 使用css選擇器 body的子子孫孫標籤p
print(len(res))


### css選擇器是通用的：bs4，lxml解析也可以是css選擇器

##css選擇器不會寫怎麼辦？
'#maincontent > div:nth-child(3) > table > tbody > tr:nth-child(13) > td:nth-child(3)'

## xpath選擇
'//*[@id="maincontent"]/div[2]/table/tbody/tr[18]/td[2]'

selenium使用

# 如果使用requests模組，傳送請求獲取的資料不全，它不能執行js

# selenium:可以使用程式碼控制模擬人操作瀏覽器


## 操作某個瀏覽器，就需要有瀏覽器驅動
# http://npm.taobao.org/mirrors/chromedriver/  谷歌驅動的淘寶映象站
# 谷歌瀏覽器版本要跟驅動版本對應

## 92.0.4515.131  下載相應版本驅動，放到專案程式碼中

# pip3 install selenium

# from selenium import webdriver
# import time
# # 開啟一個谷歌瀏覽器
# bro=webdriver.Chrome(executable_path='chromedriver.exe')
#
# #位址列中輸入百度
# bro.get('https://www.cnblogs.com/')
#
# time.sleep(2)
#
# print(bro.page_source)  #當前頁面的html內容
#
# bro.close()  # 關閉瀏覽器


# import requests
#
# res=requests.get('https://dig.chouti.com/',headers={
#     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
# })
# print(res.text)

基本使用

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Drive a real Chrome window through Baidu's username/password login dialog.
# Elements are located with find_element(By.<strategy>, value), which is
# available in both Selenium 3 and 4 (the find_element_by_* helpers are
# deprecated in Selenium 4).
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # Implicit wait: element lookups keep retrying for up to 10 s.
    bro.implicitly_wait(10)

    bro.get('https://www.baidu.com/')

    # Open the login dialog. Prefer lookup by id when the element has one;
    # the equivalent CSS selector would be '#s-top-loginbtn'.
    sub_button = bro.find_element(By.ID, 's-top-loginbtn')
    sub_button.click()

    # Switch the dialog to username/password login.
    user_btn = bro.find_element(By.XPATH, '//*[@id="TANGRAM__PSP_11__footerULoginBtn"]')
    user_btn.click()

    username = bro.find_element(By.ID, 'TANGRAM__PSP_11__userName')
    password = bro.find_element(By.ID, 'TANGRAM__PSP_11__password')

    # Type into the input boxes.
    username.send_keys('[email protected]')
    password.send_keys('lqz12345')

    submit_btn = bro.find_element(By.ID, 'TANGRAM__PSP_11__submit')
    time.sleep(3)
    submit_btn.click()

    time.sleep(3)
finally:
    # quit() ends the whole session and stops the driver process, even when a
    # lookup above fails; close() would only close the current window.
    bro.quit()

無頭瀏覽器


from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Command-line switches for a headless Chrome session.
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')  # browser resolution
chrome_options.add_argument('--disable-gpu')  # recommended by Google's docs to avoid a bug
chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
chrome_options.add_argument('--headless')  # no visible window; needed on Linux hosts without a display

# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options)
try:
    driver.get('https://www.baidu.com')
    print(driver.page_source)
finally:
    # quit() ends the session and stops the driver process even if the page
    # load fails; close() would only close the window.
    driver.quit()

獲取元素位置，屬性，大小

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Inspect an element's tag name, position, size and attributes on the 12306
# login page.
driver = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    driver.get('https://kyfw.12306.cn/otn/resources/login.html')
    driver.implicitly_wait(10)

    # find_element(By..., value) works on Selenium 3 and 4; the
    # find_element_by_* helpers are deprecated in Selenium 4.
    user_login = driver.find_element(By.CSS_SELECTOR, '.login-hd-account>a')
    user_login.click()
    time.sleep(2)

    img = driver.find_element(By.ID, 'J-loginImg')
    print(img)

    print(img.id)        # Selenium's own element id, not the HTML id; can be ignored
    print(img.tag_name)  # tag name

    print('-----')
    print(img.location)  # position of the <img> on the page
    print(img.size)      # rendered size of the <img>

    # Reading an attribute (e.g. 'src' or 'class').
    # print(img.get_attribute('src'))
    print(img.get_attribute('class'))
finally:
    # quit() ends the session and stops the driver process even on errors.
    driver.quit()

等待元素被載入

from selenium import webdriver

# Two ways of waiting for elements to load:
#
#   * Explicit wait: block until one specific condition holds, e.g.
#         wait = WebDriverWait(driver, 10)
#         wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
#         contents = driver.find_element(By.CSS_SELECTOR, '#content_left')
#
#   * Implicit wait: a single call that applies to every element lookup made
#     afterwards:
#         driver.implicitly_wait(10)
#         driver.find_element(...)

driver = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    driver.get('https://www.baidu.com')

    # From here on, any element lookup whose target is not present yet is
    # retried for at most 10 s.
    driver.implicitly_wait(10)

    print(driver.page_source)
finally:
    # quit() ends the session and stops the driver process even on errors.
    driver.quit()

元素操作

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Demonstrates typing into, clearing and clicking elements on Baidu's home page.
driver = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    driver.get('https://www.baidu.com')
    driver.implicitly_wait(10)

    # find_element(By..., value) works on Selenium 3 and 4; the
    # find_element_by_* helpers are deprecated in Selenium 4.
    input_search = driver.find_element(By.ID, 'kw')
    input_search.send_keys('美女')  # type text
    time.sleep(3)
    input_search.clear()  # empty the box

    time.sleep(2)
    input_search.send_keys('性感美女')
    time.sleep(2)
    btn = driver.find_element(By.ID, 'su')
    btn.click()  # click the search button
    time.sleep(10)
finally:
    # quit() ends the session and stops the driver process even on errors.
    driver.quit()

執行js

import time
from selenium import webdriver

# Run JavaScript inside the page served by the local development server.
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('http://127.0.0.1:8000/')
driver.implicitly_wait(10)

# Each string is JavaScript source executed in the page: the first sets
# `name`, the second shows it in an alert box.
for script in ("name='egon';", "alert(name)"):
    driver.execute_script(script)

time.sleep(5)
# Closing the window is left disabled here, as in the original snippet:
# driver.close()

切換選項卡

import time
from selenium import webdriver

# Open a second tab with JavaScript and switch between the two tabs.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    browser.execute_script('window.open()')

    # One handle per open tab, in the order the tabs were opened.
    handles = browser.window_handles
    print(handles)

    # switch_to.window() replaces the deprecated switch_to_window().
    browser.switch_to.window(handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(5)

    browser.switch_to.window(handles[0])
    browser.get('https://www.sina.com.cn')
finally:
    # Two tabs are open at this point: close() would shut only the current one
    # and leave the browser running, so end the whole session with quit().
    browser.quit()

模擬前進後退

import time
from selenium import webdriver

# Navigate through three pages, then step back and forward in the history.
browser = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    browser.get('https://www.baidu.com')
    browser.get('https://www.taobao.com')
    browser.get('http://www.sina.com.cn/')

    browser.back()
    time.sleep(3)
    browser.forward()
finally:
    # quit() ends the session and stops the driver process even if a page
    # load fails; close() would only close the window.
    browser.quit()

異常處理

from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

# Pattern for any selenium script: do the work in `try`, report failures, and
# always release the browser in `finally`.
browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')

except Exception as e:
    # The exception classes imported above can be caught individually when a
    # specific failure needs its own handling.
    print(e)
finally:
    # quit() ends the session and stops the driver process; close() would only
    # close the current window.
    browser.quit()

selenium登入cnblogs獲取cookie

#selenium登入cnblogs獲取cookie
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException
import time
import json
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.implicitly_wait(10)

####  登入過程
# try:
#     browser.get('http://www.cnblogs.com')
#     submit_btn=browser.find_element_by_link_text('登入')  # a標籤的內容
#     submit_btn.click()
#
#     username=browser.find_element_by_id('mat-input-0')
#     password=browser.find_element_by_id('mat-input-1')
#     username.send_keys('[email protected]')
#     password.send_keys('1111')
#     input('等會')
#     sub_btn=browser.find_element_by_css_selector('body > app-root > mat-sidenav-container > mat-sidenav-content > div > div > app-sign-in > app-content-container > div > div > div > form > div > button > span.mat-button-wrapper')
#     sub_btn.click()
#
#     # 人工參與，滑動
#     input('等會')
#
#     # 獲取到登入後的cookie
#     print(browser.get_cookies())
#
#     with open('cookie.json','w') as f:
#         json.dump(browser.get_cookies(),f)
#
#
# except Exception as e:
#     print(e)
# finally:
#     browser.close()


### 不登入了，把cookie寫入瀏覽器
# browser.get('http://www.cnblogs.com')
# with open('cookie.json','r') as f:
#     cookie=json.load(f)
# time.sleep(5)
# for item in cookie:  # 設定cookie必須用字典，cookie的json檔案是列表，所以用迴圈往裡放
#     browser.add_cookie(item)
#
#
#
# browser.refresh()  # 重新整理頁面
#
# time.sleep(5)
#
# browser.close()

抽屜半自動點贊

from selenium import webdriver
import json
import time

#### 登入過程
# bro=webdriver.Chrome(executable_path='chromedriver.exe')
# bro.implicitly_wait(10)
# bro.get('https://dig.chouti.com/')
# try:
#     sub_btn=bro.find_element_by_id('login_btn')
#     print(sub_btn)
#
#     # sub_btn.click()  # 報錯
#     bro.execute_script('arguments[0].click();',sub_btn)
#
#     # username=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-body > div.form-item.login-item.clearfix.phone-item.mt24 > div.input-item.input-item-short.left.clearfix > input')
#     username=bro.find_element_by_css_selector('div.input-item>input.login-phone')
#     username.send_keys('18953675221')
#     # password=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div.form-item.login-item.clearfix.mt24 > div')
#     password = bro.find_element_by_css_selector('div.input-item>input.pwd-password-input')
#     password.send_keys('lqz123')
#
#     time.sleep(3)
#     btn=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
#
#     btn.click()
#
#     input('等')
#
#     with open('chouti.json','w') as f:
#         json.dump(bro.get_cookies(),f)
#
#
#
#
# finally:
#     bro.close()
import requests
from selenium.webdriver.common.by import By

# Semi-automatic voting: selenium renders the page and exposes the article
# ids, requests sends the vote calls using cookies saved by the login step.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.implicitly_wait(10)
    bro.get('https://dig.chouti.com/')

    # Scroll to the bottom of the page.
    bro.execute_script('window.scrollTo(0, document.body.scrollHeight);')

    # Selenium stores cookies as a list of dicts; requests wants a plain
    # name -> value mapping, so flatten the saved list.
    with open('chouti.json', 'r') as f:
        saved_cookies = json.load(f)
    cookie = {item['name']: item['value'] for item in saved_cookies}

    print(cookie)  # cookies in the form requests can use

    # find_element(By..., value) works on Selenium 3 and 4; the
    # find_element_by_* helpers are deprecated in Selenium 4.
    container = bro.find_element(By.CLASS_NAME, 'link-con')
    time.sleep(2)
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }
    # A distinct loop variable keeps the container element from being
    # overwritten by its own children.
    for link_item in container.find_elements(By.CLASS_NAME, 'link-item'):
        article_id = link_item.get_attribute('data-id')
        print(article_id)
        # Send the vote with requests, authenticated by the saved cookies.
        res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': article_id}, cookies=cookie, headers=header)
        print(res.text)
finally:
    # quit() ends the session and stops the driver process even if the cookie
    # file is missing or a lookup fails.
    bro.quit()

打碼平臺使用

# 人工破解
# 影象識別模組---》數字，字母組合
# 驗證碼破解平臺---》雲打碼，超級鷹
	-給它一張圖片---》結果返回   （收費的）

    
    
#!/usr/bin/env python
# coding:utf-8

import requests
from hashlib import md5


class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    Holds the account parameters and headers that every API call sends and
    offers one method per endpoint used here.
    """

    def __init__(self, username, password, soft_id, timeout=None):
        """Store the credentials and prepare the shared request data.

        username: account name.
        password: plain-text account password; only its MD5 hex digest is
            kept and sent (as 'pass2').
        soft_id: software ID sent as 'softid'.
        timeout: optional timeout in seconds passed to every HTTP call.
            The default None keeps the previous behaviour of waiting
            indefinitely.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Parameters included in every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the service's decoded JSON reply.

        im: image bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        # The image is sent as a multipart file field named 'userfile'.
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers, timeout=self.timeout)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly solved captcha and return the decoded JSON reply.

        im_id: ID of the picture whose answer was wrong.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params,
                          headers=self.headers, timeout=self.timeout)
        return r.json()


if __name__ == '__main__':
    # Account name, password and software ID; generate your own software ID in
    # the service's user centre and substitute it here.
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')
    # Read the local captcha image (replace a.jpg with your own path). The
    # `with` block closes the file handle, which the bare open().read() never did.
    with open('a.jpg', 'rb') as image_file:
        im = image_file.read()
    # 1902 is the captcha type code, taken from the service's price list.
    print(chaojiying.PostPic(im, 1902))

xpath使用

1 一門在html中查詢資料的語言
2 記住的語法：
	/   取當前路徑下的xx   
    //  取所有路徑下的xx   
    .   當前路徑    
    ..   上一層
	@    取屬性
    
3 lxml解析模組提供的xpath
doc='''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' name='sss'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html' name='lqz'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
from lxml import etree

# Parse the HTML text above into an element tree
html=etree.HTML(doc)

# Each numbered group below demonstrates one XPath feature. Enable exactly one
# `a=...` line at a time; the final print(a) shows its result. The last
# example is enabled by default so the script runs as-is.

# res=html.xpath('//body')
# print(res)

# 1 All nodes
# a=html.xpath('//*')




# 2 A specific node (the result is a list)
# a=html.xpath('//head')
# 3 Child nodes and descendant nodes
# a=html.xpath('//div/a')
# a=html.xpath('//body/a') # no data: <a> is not a direct child of <body>
# a=html.xpath('//body//a')
# 4 Parent node
# a=html.xpath('//body//a[@href="image1.html"]/..')
# a=html.xpath('//body//a')
# a=html.xpath('//body//a[@href="image1.html"]')
# a=html.xpath('//body//a[1]/..')
# This form works as well
# a=html.xpath('//body//a[1]/parent::*')
# a=html.xpath('//body//a[1]/parent::p')  # empty: the parent is a div, not a p
# 5 Matching on an attribute
# a=html.xpath('//a[@href="image1.html"]')
# a=html.xpath('//a[@name="sss"]')

# 6 Getting text with text()
# a=html.xpath('//a[@href="image1.html"]/text()')
# a=html.xpath('//a/text()')

# 7 Getting attribute values
# a=html.xpath('//a/@href')
# a=html.xpath('//a[1]/@name')
# # Note: positions start at 1 (not 0)
# a=html.xpath('//body//a[2]/@href')
# 8 Matching an attribute that holds several values
#  When an <a> tag has several classes, an exact match fails; use contains()
# a=html.xpath('//a[@class="li"]')
# a=html.xpath('//a[contains(@class,"li")]')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 Matching on several attributes
# a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 Selecting by position
# a=html.xpath('//a[2]/text()')
# a=html.xpath('//a[2]/@href')
# a=html.xpath('//a[2]/@name')
# The last one
# a=html.xpath('//a[last()]/@href')
# Those whose position is less than 3
# a=html.xpath('//a[position()<3]/@href')
# Third from last (last()-1 would be the second from last)
# a=html.xpath('//a[last()-2]/@href')
# 11 Selecting along node axes
# ancestor: ancestor nodes
# Using * returns all ancestor nodes
# a=html.xpath('//a/ancestor::*')
# # Only the div among the ancestors
# a=html.xpath('//a/ancestor::div')
# attribute: attribute values
# a=html.xpath('//a[1]/attribute::*')
# child: direct child nodes
# a=html.xpath('//a[1]/child::*')
# a=html.xpath('//a[1]/child::img/@src')
# descendant: all descendant nodes
# a=html.xpath('//a[6]/descendant::*')
# following: all nodes after the current node
# a=html.xpath('//a[1]/following::*')
# a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: sibling nodes after the current node
# a=html.xpath('//a[1]/following-sibling::*')
# a=html.xpath('//a[1]/following-sibling::a')
# a=html.xpath('//a[1]/following-sibling::*[2]/text()')
# Enabled by default: with every example commented out, print(a) below would
# raise NameError because `a` was never assigned.
a=html.xpath('//a[1]/following-sibling::*[2]/@href')

print(a)

自動登入12306

from selenium import webdriver
import base64
from PIL import Image
import time
from chaojiying import Chaojiying_Client
from selenium.webdriver import ActionChains


# Keep the site from detecting that the browser is controlled by a driver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
# Pass the options through `options=`; the older `chrome_options=` keyword is
# deprecated and only triggers a DeprecationWarning before being mapped to it.
bro=webdriver.Chrome(executable_path='./chromedriver.exe',options=options)

bro.get('https://kyfw.12306.cn/otn/resources/login.html')

# Implicit wait: element lookups retry for up to 10 seconds before failing
bro.implicitly_wait(10)
# Maximize the browser window
bro.maximize_window()

# Log in to 12306: fill in the account form, solve the click-captcha through
# the Chaojiying service, click the returned points, submit, then drag the
# slider. Any failure is printed and the browser is always shut down.
try:
    # Click the link inside .login-hd-account (presumably the tab that
    # switches to account/password login -- verify against the page).
    username_login_btn=bro.find_element_by_css_selector('.login-hd-account>a')
    username_login_btn.click()

    username=bro.find_element_by_id('J-userName')
    password=bro.find_element_by_id('J-password')
    login_btn=bro.find_element_by_id('J-login')
    username.send_keys('liuqingzheng')
    password.send_keys('lqz12345')


    img_code=bro.find_element_by_id('J-loginImg')
    print(img_code.size)
    print(img_code.location)
    # Two ways to obtain the captcha image
    # Option 1: screenshot the whole page, then crop the captcha by position
    # bro.save_screenshot('main.png') # screenshot of the whole page, main.png
    #
    # location=img_code.location
    # size=img_code.size
    # print(location)
    # print(size)
    # # Coordinates of the captcha
    # img_tu = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
    # # Open the screenshot with pillow
    # img=Image.open('./main.png')
    # # Crop the captcha out of the screenshot by position
    # code_img=img.crop(img_tu)
    # # Save the cropped image locally
    # code_img.save('./code2.png')

    # Option 2: decode the image's base64 data and save it locally.
    # The src attribute is expected to end with the base64 payload after
    # the last comma.
    img_base64=img_code.get_attribute('src')
    img_base64_real=img_base64.split(',')[-1]
    img_1=base64.b64decode(img_base64_real)
    with open('code.jpg','wb') as f:
        f.write(img_1)


    # Call Chaojiying to recognize the captcha.
    # Software ID comes from user center >> software ID (generate your own
    # to replace the sample 96001).
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')
    # Send the bytes already in memory; they are exactly what was written to
    # code.jpg, so re-opening the file (and leaking its handle) is unnecessary.
    # 9004 is the captcha type code; see the official site's price list.
    res=chaojiying.PostPic(img_1, 9004)
    # 123,155|42,135|11,77---->[[123,155],[42,135],[11,77]]
    print(res)
    result=res['pic_str']
    # split('|') also yields a one-item list when there is a single point,
    # so one loop covers both the multi-point and single-point answers.
    all_list = []
    for point in result.split('|'):
        parts = point.split(',')
        all_list.append([int(parts[0]), int(parts[1])])
    print(all_list)


    ### Click the recognized points on the page
    # Click  [[123,155],[42,135],[11,77]]
    # NOTE(review): the origin used for these offsets depends on the
    # installed Selenium version -- confirm the clicks land as expected.
    for item in all_list:
        ActionChains(bro).move_to_element_with_offset(img_code,item[0],item[1]).click().perform()
        time.sleep(1)

    time.sleep(5)
    login_btn.click()
    time.sleep(1)

    # Drag the slider
    span=bro.find_element_by_id('nc_1_n1z')
    ActionChains(bro).drag_and_drop_by_offset(span, 300, 0).perform()

    time.sleep(30)

    print(bro.get_cookies())
except Exception as e:
    print(e)
finally:
    # quit() closes every window and ends the driver session. A preceding
    # close() is redundant, and if it raised, quit() would be skipped and
    # the driver process would be left running.
    bro.quit()

願君前程似錦，歸來仍是少年