

### Python Selenium scraping in practice

Target website:

```
https://spa2.scrape.center/
```

The target is a movie rating site, and we will scrape it with Selenium.

#### Step 1: import the required libraries

```
import json

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import logging
from urllib.parse import urljoin
from os import makedirs
from os.path import exists
```

In practice you can import each library as you need it while writing the code, but they are all listed up front here so you can see what is involved. Most of them are Selenium modules.

#### Step 2: initialize the browser

```
from selenium.webdriver import Chrome  # import the Chrome driver class

# start the browser
web = Chrome()
# request a page in the opened browser
web.get('https://www.baidu.com')
```

For more detail on browser setup, you can read my other article:

```
https://blog.csdn.net/Deng872347348/article/details/112688855#comments_20429011
```
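If you would rather not have a browser window pop up while scraping, Chrome can also be started in headless mode. A minimal sketch (an optional variation, not part of the original script; it assumes a reasonably recent Chrome that supports the `--headless` flag):

```
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')            # run Chrome without opening a window
browser = webdriver.Chrome(options=options)   # pass the options when creating the driver
browser.get('https://spa2.scrape.center/')
print(browser.title)
browser.quit()
```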

To avoid redundant code, the script is organized into small functions defined with `def`, which reduces duplication. An even better approach would be to write it in an object-oriented style, which minimizes duplication further.
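The step-by-step snippets below assume the following module-level setup (together with the imports from Step 1; it appears again in the full code at the end): logging configuration, the list-page URL template, the wait timeout, the page count, and a shared browser plus `WebDriverWait` instance.

```
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')  # logging config

INDEX_URL = 'https://spa2.scrape.center/page/{page}'  # movie list page template
TIME_OUT = 10    # maximum wait time
TOTAL_PAGE = 10  # number of list pages

browser = webdriver.Chrome()             # initialize the browser
wait = WebDriverWait(browser, TIME_OUT)  # explicit wait with the configured timeout
```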

#### Define a generic scraping function:

```
def scrape_page(url, condition, locator):  # generic scraping method
    logging.info('scraping %s', url)
    try:
        browser.get(url)
        wait.until(condition(locator))  # wait until the expected condition holds
    except TimeoutException:  # error handling
        logging.error('error occurred while scraping %s', url, exc_info=True)
```

#### Define a function that scrapes a list page:

```
def scrape_index(page):
    url = INDEX_URL.format(page=page)  # build the list-page url
    scrape_page(url, condition=EC.visibility_of_all_elements_located,  # wait until the items are visible
                locator=(By.CSS_SELECTOR, '#index .item'))  # CSS locator
```
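For example, with the `INDEX_URL` template defined above, page 2 expands to the full list-page URL:

```
INDEX_URL = 'https://spa2.scrape.center/page/{page}'
INDEX_URL.format(page=2)  # -> 'https://spa2.scrape.center/page/2'
```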

#### Define the detail-page scraper:

```
def scrape_detail(url):
    scrape_page(url, condition=EC.presence_of_element_located,
                locator=(By.TAG_NAME, 'h2'))
```

#### Parse the detail page:

```
def parse_detail():
    url = browser.current_url  # current page url
    name = browser.find_element_by_tag_name('h2').text
    categories = [element.text for element in browser.find_elements_by_css_selector('.categories button span')]
    cover = browser.find_element_by_css_selector('.cover').get_attribute('src')
    score = browser.find_element_by_class_name('score').text
    drama = browser.find_element_by_css_selector('.drama p').text
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }
```
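Note that the `find_element_by_*` helpers used above belong to older Selenium releases and were removed in Selenium 4. If you run a newer Selenium, the equivalent lookups use `find_element`/`find_elements` with `By`; a sketch of the same function under that assumption:

```
from selenium.webdriver.common.by import By

def parse_detail():
    url = browser.current_url  # current page url
    name = browser.find_element(By.TAG_NAME, 'h2').text
    categories = [element.text for element in
                  browser.find_elements(By.CSS_SELECTOR, '.categories button span')]
    cover = browser.find_element(By.CSS_SELECTOR, '.cover').get_attribute('src')
    score = browser.find_element(By.CLASS_NAME, 'score').text
    drama = browser.find_element(By.CSS_SELECTOR, '.drama p').text
    return {'url': url, 'name': name, 'categories': categories,
            'cover': cover, 'score': score, 'drama': drama}
```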

#### Collect the detail-page links

```
def parse_index():  # collect detail-page links from the list page
    elements = browser.find_elements_by_css_selector('#index .item .name')
    for element in elements:
        href = element.get_attribute('href')
        yield urljoin(INDEX_URL, href)
```
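Here `urljoin` simply guarantees the yielded link is absolute: the `href` attribute Selenium returns is usually already a full URL and is passed through unchanged, while a relative path would be resolved against the site root. A small illustration with concrete values:

```
from urllib.parse import urljoin

urljoin('https://spa2.scrape.center/page/1', '/detail/1')
# -> 'https://spa2.scrape.center/detail/1'
urljoin('https://spa2.scrape.center/page/1', 'https://spa2.scrape.center/detail/1')
# -> 'https://spa2.scrape.center/detail/1'
```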

#### Save the data:

```
RESULTS_DIR = 'results'  # output directory
exists(RESULTS_DIR) or makedirs(RESULTS_DIR)  # create it if it does not exist

def save_data(data):  # save one movie's data as JSON
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
```

#### The main entry point:

```
def main():
    try:
        for page in range(1, TOTAL_PAGE + 1):
            scrape_index(page)
            detail_urls = parse_index()
            for detail_url in list(detail_urls):
                logging.info('get detail url %s', detail_url)
                scrape_detail(detail_url)
                detail_data = parse_detail()
                save_data(detail_data)
                logging.info('detail data %s', detail_data)
    finally:
        browser.close()


if __name__ == '__main__':
    main()
```

#### Full code:

```
import json

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import logging
from urllib.parse import urljoin
from os import makedirs
from os.path import exists

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')  # logging config

INDEX_URL = 'https://spa2.scrape.center/page/{page}'  # movie list page template
TIME_OUT = 10  # maximum wait time
TOTAL_PAGE = 10  # number of list pages

browser = webdriver.Chrome()  # initialize the browser
wait = WebDriverWait(browser, TIME_OUT)  # explicit wait with the configured timeout


def scrape_page(url, condition, locator):  # generic scraping method
    logging.info('scraping %s', url)
    try:
        browser.get(url)
        wait.until(condition(locator))  # wait until the expected condition holds
    except TimeoutException:  # error handling
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_index(page):
    url = INDEX_URL.format(page=page)  # build the list-page url
    scrape_page(url, condition=EC.visibility_of_all_elements_located,  # wait until the items are visible
                locator=(By.CSS_SELECTOR, '#index .item'))  # CSS locator


def scrape_detail(url):
    scrape_page(url, condition=EC.presence_of_element_located,
                locator=(By.TAG_NAME, 'h2'))


def parse_detail():
    url = browser.current_url  # current page url
    name = browser.find_element_by_tag_name('h2').text
    categories = [element.text for element in browser.find_elements_by_css_selector('.categories button span')]
    cover = browser.find_element_by_css_selector('.cover').get_attribute('src')
    score = browser.find_element_by_class_name('score').text
    drama = browser.find_element_by_css_selector('.drama p').text
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }


def parse_index():  # collect detail-page links from the list page
    elements = browser.find_elements_by_css_selector('#index .item .name')
    for element in elements:
        href = element.get_attribute('href')
        yield urljoin(INDEX_URL, href)


RESULTS_DIR = 'results'  # output directory

exists(RESULTS_DIR) or makedirs(RESULTS_DIR)  # create it if it does not exist


def save_data(data):  # save one movie's data as JSON
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main():
    try:
        for page in range(1, TOTAL_PAGE + 1):
            scrape_index(page)
            detail_urls = parse_index()
            for detail_url in list(detail_urls):
                logging.info('get detail url %s', detail_url)
                scrape_detail(detail_url)
                detail_data = parse_detail()
                save_data(detail_data)
                logging.info('detail data %s', detail_data)
    finally:
        browser.close()


if __name__ == '__main__':
    main()
```

If the browser window flashes open and immediately closes when you run the script, and the code raises the error below, your ChromeDriver version does not match your Chrome browser version:

```
selenium.common.exceptions.SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 92
Current browser version is 99.0.4844.74 with binary path C:\Program Files\Google\Chrome\Application\chrome.exe
```

You can open Chrome's settings to check which version your browser has been updated to, as shown below:

![](https://img2022.cnblogs.com/blog/2510292/202203/2510292-20220321104848984-2071845970.png)

![在這裡插入圖片描述](https://img-blog.csdnimg.cn/b9c748db48754b2ab81b4e962e6f2b7a.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBARGVuZzg3MjM0NzM0OA==,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)


Here is a site where you can download the matching ChromeDriver:

```
https://chromedriver.storage.googleapis.com/index.html
```

Drivers matching recent Chrome releases are available there, including macOS and Linux builds.
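If you would rather not download and swap the driver by hand, the third-party `webdriver-manager` package can fetch a ChromeDriver that matches your installed Chrome automatically. A minimal sketch, assuming you have installed it with `pip install webdriver-manager` and are using a Selenium 3.x setup like the one in this article (on Selenium 4 you would wrap the path in a `Service` object instead):

```
# pip install webdriver-manager  (third-party package, not part of Selenium itself)
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# downloads a ChromeDriver matching the installed Chrome and returns its path
browser = webdriver.Chrome(ChromeDriverManager().install())
browser.get('https://spa2.scrape.center/')
print(browser.title)
browser.quit()
```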