爬蟲從入門到入獄之入門（2）

阿新 • • 發佈：2022-12-06

1 css選擇器

bs4 可以通過遍歷，搜尋，css選擇器選擇標籤

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title">asdfasdf<b id="bbb" class="boldest">The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
 
"""

# Parse the sample document with the lxml backend.
soup = BeautifulSoup(html_doc, features='lxml')

# CSS selector cheat sheet -- any of these can be passed to soup.select():
#   'a'                           every <a> tag
#   '#link1'                      '#' selects by id
#   '.sister'                     '.' selects by class
#   'body>p>a'                    <a> that is a direct child of a <p> under <body>
#   'body>p>a:nth-child(2)'       same, restricted to the 2nd child of its parent
#   'body>p>a:nth-last-child(1)'  same, restricted to the last child of its parent
# Learning CSS selectors pays off: nearly every parser (bs4, lxml, ...)
# supports CSS selectors and/or XPath.

# Attribute selector: [attribute=value]
tillie_links = soup.select('a[href="http://example.com/tillie"]')
print(tillie_links)


'''
記住的：
    1  標籤名
    2  .類名
    3  #id號
    4 body a   body下子子孫孫中的a（後代選擇器）
    5 body>a  body下直接子層的a，不包含孫層（子選擇器）
    6 其他的參照css選擇器
    
    
'''

2 selenium基本使用

# requests 傳送http請求獲取資料，獲取到的資料是html/xml，使用bs4解析，解析出咱們想要的資料
    -使用requests獲取回來的資料，跟直接在瀏覽器中看到的資料，可能不一樣

    -requests不能執行js
    -如果使用requests，需要分析當次請求發出了多少請求，每個都要傳送一次，才能拼湊出網頁完整的資料
    
    
    
# selenium 操作瀏覽器，控制瀏覽器，模擬人的行為
# 人為點：功能測試
# 自動化測試（介面測試，壓力測試），網站，人為點，指令碼    Appium
# 測試開發 
selenium最初是一個自動化測試工具,而爬蟲中使用它主要是為了解決requests無法直接執行JavaScript程式碼的問題
selenium本質是通過驅動瀏覽器，完全模擬瀏覽器的操作，比如跳轉、輸入、點選、下拉等，來拿到網頁渲染之後的結果，可支援多種瀏覽器



# 使用：
    -安裝模組：pip3 install selenium
    -下載瀏覽器驅動：selenium操作瀏覽器，需要有瀏覽器(谷歌瀏覽器)，谷歌瀏覽器驅動
        -https://registry.npmmirror.com/binary.html?path=chromedriver/
        -瀏覽器版本對應的驅動
        106.0.5249.119    找到相應的驅動
        
    -寫程式碼測試
    import time

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service

    # Selenium 4 takes the driver location through a Service object; the old
    # `executable_path=` keyword on webdriver.Chrome was removed in 4.10.
    # If chromedriver is on PATH, Service() can be created without arguments.
    service = Service(executable_path='./chromedriver.exe')

    # Open a browser window.
    bro = webdriver.Chrome(service=service)
    try:
        # Type the site into the address bar.
        bro.get('http://www.baidu.com')

        time.sleep(3)
        bro.close()  # close the current tab
    finally:
        # Always end the session so neither the browser nor the chromedriver
        # process is left running, even if the page load above raised.
        bro.quit()  # close the browser
    
    
    
    
# RPA：機器人流程自動化，代替人為做的體力活

3 無介面瀏覽器

import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

chrome_options = Options()
# Optional switches (uncomment as needed):
# chrome_options.add_argument('window-size=1920x3000')  # set the browser resolution
# chrome_options.add_argument('--disable-gpu')  # recommended by Google docs to avoid a bug
# chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars for some special pages
# chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
# Run without a visible window. On Linux without a display, startup fails
# unless this flag is set.
chrome_options.add_argument('--headless')
# chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # explicit browser path

# Selenium 4 takes the driver location through a Service object; the old
# `executable_path=` keyword on webdriver.Chrome was removed in 4.10.
# If chromedriver is on PATH, Service() can be created without arguments.
service = Service(executable_path='./chromedriver.exe')
bro = webdriver.Chrome(service=service, options=chrome_options)
try:
    # Type the site into the address bar.
    bro.get('https://www.jd.com/')
    print(bro.page_source)  # the page content as the browser sees it

    time.sleep(3)
    bro.close()  # close the current tab
finally:
    # Always end the session, even if loading the page raised.
    bro.quit()  # close the browser

5 selenium其它用法

5.1 小案例，自動登入百度

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 takes the driver location through a Service object; the old
# `executable_path=` keyword on webdriver.Chrome was removed in 4.10.
bro = webdriver.Chrome(service=Service(executable_path='./chromedriver.exe'))
try:
    bro.get('http://www.baidu.com')
    # Implicit wait: when an element is not present yet, keep retrying the
    # lookup for up to 10 seconds before failing.
    bro.implicitly_wait(10)
    bro.maximize_window()  # maximise the window (this is not full-screen mode)

    # Locate the login link by the exact text of the <a> tag, then click it.
    # NOTE(review): LINK_TEXT must match the page text exactly; this literal
    # looks script-converted (the live page is presumably Simplified Chinese,
    # i.e. '登录') -- confirm against the page before running.
    a = bro.find_element(by=By.LINK_TEXT, value='登入')
    a.click()

    # Ids are unique within a page, so prefer By.ID whenever one exists.
    input_name = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__userName')
    # Type the user name.
    # NOTE(review): '[email protected]' is an e-mail obfuscation artifact from
    # the web page this snippet was copied from -- replace with a real account.
    input_name.send_keys('[email protected]')

    time.sleep(1)

    input_password = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__password')
    input_password.send_keys('lqz12345')
    time.sleep(1)

    input_submit = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__submit')
    # Submit the login form.
    input_submit.click()
    time.sleep(5)
finally:
    # quit() ends the whole session; close() alone would only close the tab
    # and leave the chromedriver process running.
    bro.quit()

5.1 獲取位置屬性大小，文字

# 查詢標籤
bro.find_element(by=By.ID,value='id號')
bro.find_element(by=By.LINK_TEXT,value='a標籤文字內容')
bro.find_element(by=By.PARTIAL_LINK_TEXT,value='a標籤文字內容模糊匹配')
bro.find_element(by=By.CLASS_NAME,value='類名')
bro.find_element(by=By.TAG_NAME,value='標籤名')
bro.find_element(by=By.NAME,value='屬性name')
# -----通用的----
bro.find_element(by=By.CSS_SELECTOR,value='css選擇器')
bro.find_element(by=By.XPATH,value='xpath選擇器')

# 獲取標籤位置，大小
print(code.location)
print(code.size)
# -------
print(code.tag_name)
print(code.id)

import base64
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 takes the driver location through a Service object; the old
# `executable_path=` keyword on webdriver.Chrome was removed in 4.10.
bro = webdriver.Chrome(service=Service(executable_path='./chromedriver.exe'))
try:
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    # Retry element lookups for up to 10 seconds, so the QR image can still be
    # found if it is not in the DOM the instant the tab is clicked.
    bro.implicitly_wait(10)

    # Switch to the "scan QR code to log in" tab.
    # Alternative locator: By.CSS_SELECTOR, '.login-hd-account>a'
    # NOTE(review): LINK_TEXT must match the page text exactly; this literal
    # looks script-converted (presumably '扫码登录' on the live page) -- confirm.
    a = bro.find_element(by=By.LINK_TEXT, value='掃碼登入')
    a.click()

    # Locate the QR-code <img>. Equivalent: By.ID, 'J-qrImg'
    code = bro.find_element(by=By.CSS_SELECTOR, value='#J-qrImg')

    # Option 1: use the element's position and size to crop it from a screenshot.
    print(code.location)
    print(code.size)
    print(code.id)  # Selenium's internal element id, NOT the tag's id attribute
    print(code.tag_name)  # the tag name

    # Option 2: read the image straight from the src attribute.
    src = code.get_attribute('src')
    print(src)
    if src and src.startswith('data:'):
        # Inline image: the base64 payload follows the last comma.
        # Decode before opening the file so a decode error cannot leave an
        # empty code.png behind.
        image_bytes = base64.b64decode(src.split(',')[-1])
        with open('code.png', 'wb') as f:
            f.write(image_bytes)
    else:
        print('QR code src is not an inline base64 image; nothing saved')

    time.sleep(3)
finally:
    # quit() ends the whole session; close() alone would only close the tab
    # and leave the chromedriver process running.
    bro.quit()

5.2 等待元素被載入

# 程式碼執行很快，有些標籤還沒加載出來，直接取，取不到
# 等待
    -顯式等待：一般不用，需要指定等待哪個標籤，如果標籤很多，每個都要設定比較麻煩
    -隱式等待：
        bro.implicitly_wait(10)
        find找標籤的時候，如果找不到，最多等10秒鐘

5.3 元素操作

# 點選
標籤.click()
# input寫文字
標籤.send_keys('文字')
#input清空文字
標籤.clear()

# 模擬鍵盤操作
from selenium.webdriver.common.keys import Keys
input_search.send_keys(Keys.ENTER)

5.4 執行js程式碼

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Selenium 4 takes the driver location through a Service object; the old
# `executable_path=` keyword on webdriver.Chrome was removed in 4.10.
bro = webdriver.Chrome(service=Service(executable_path='./chromedriver.exe'))
try:
    bro.get('https://www.jd.com/')

    # 1. execute_script can run arbitrary JavaScript, e.g. show the cookies:
    # bro.execute_script('alert(document.cookie)')

    # 2. Scroll the page.
    # Scroll down step by step:
    # for i in range(10):
    #     y = 400 * (i + 1)
    #     bro.execute_script('scrollTo(0,%s)' % y)
    #     time.sleep(1)
    # Or jump straight to the bottom in one call:
    bro.execute_script('scrollTo(0,document.body.scrollHeight)')

    time.sleep(3)
finally:
    # quit() ends the whole session; close() alone would only close the tab
    # and leave the chromedriver process running.
    bro.quit()

5.5 切換選項卡

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Selenium 4 takes the driver location through a Service object; the old
# `executable_path=` keyword on webdriver.Chrome was removed in 4.10.
bro = webdriver.Chrome(service=Service(executable_path='./chromedriver.exe'))
try:
    bro.get('https://www.jd.com/')

    # Open a new tab with JavaScript.
    bro.execute_script('window.open()')

    # Switch to the newly opened tab: index 0 is the original tab, so the
    # new one is at index 1.
    bro.switch_to.window(bro.window_handles[1])
    bro.get('http://www.taobao.com')
    time.sleep(2)
    # Switch back to the original tab.
    bro.switch_to.window(bro.window_handles[0])

    time.sleep(3)
    bro.close()  # close the current tab
finally:
    # Always end the session (all tabs + chromedriver), even on error.
    bro.quit()

5.6 瀏覽器前進後退

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Selenium 4 takes the driver location through a Service object; the old
# `executable_path=` keyword on webdriver.Chrome was removed in 4.10.
bro = webdriver.Chrome(service=Service(executable_path='./chromedriver.exe'))
try:
    # Visit three pages in a row to build up some browser history.
    bro.get('https://www.jd.com/')

    time.sleep(2)
    bro.get('https://www.taobao.com/')

    time.sleep(2)
    bro.get('https://www.baidu.com/')

    # Go back one page in the history.
    bro.back()
    time.sleep(1)
    # Go forward one page in the history.
    bro.forward()
    time.sleep(3)
finally:
    # quit() ends the whole session; close() alone would only close the tab
    # and leave the chromedriver process running.
    bro.quit()

5.7 異常處理

from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException

# Template: wrap browser operations so the session is always released.
# `bro` is the WebDriver instance created earlier in the script.
try:
    # Put the browser operations that may fail here (bro.get, find_element,
    # switch_to.frame, ...). A `try:` block cannot be empty, hence `pass`.
    pass
except (TimeoutException, NoSuchElementException, NoSuchFrameException) as e:
    # Expected Selenium failures: page/element/frame not available in time.
    print(e)
except Exception as e:
    # Anything else is still reported rather than crashing, as in the original.
    print(e)
finally:
    # quit() ends the whole session; close() alone would only close the tab
    # and leave the chromedriver process running.
    bro.quit()