程式人生 > 其它 > 使用selenium用ISBN在京東上批量爬取書籍資訊

使用selenium用ISBN在京東上批量爬取書籍資訊

首先讀取 .xls 檔案,然後根據表格裡的ISBN在京東上挨個搜尋,再把需要的資訊從網頁上提取出來儲存在另一個檔案裡。

每次執行 .py 檔案後開啟瀏覽器會彈出登入頁面(30s),在此期間手動登入,30秒後開始爬取。

#!/usr/bin/python
# -*- coding: UTF-8 -*-

# Standard library
import json
import datetime
from time import sleep

# Third-party
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys  # fix: statement was broken across a line break
from lxml import etree
import xlrd
import xlwt

# Headless-browser setup kept for reference; uncomment to run without a UI.
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-gpu')
# options.add_argument('--disable-dev-shm-usage')
# driver = webdriver.Chrome(chrome_options=options)
# Scraped fields for one book. Pre-filled with placeholders; a field keeps
# its previous value when it cannot be found on a product page (the dict is
# shared/mutated across calls to singleData — original design, kept as-is).
data_dict = tDict = {'ISBN': '0000000000000', '出版時間': '0000-00-00', '版次': '1'}

# Module-level browser used for the actual scraping.
driver = webdriver.Chrome()


def test01_login():
    """Open the JD login page, wait 30 s for a manual login, dump cookies.

    A separate, temporary Chrome instance is used for the login; the cookies
    it saves to ``cookies.txt`` are later loaded into the module-level
    ``driver`` by :func:`singleData`.
    """
    login_driver = webdriver.Chrome()
    try:
        login_driver.get(
            "https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Fwww.jd.com%2F")
        sleep(30)  # log in by hand during this window
        cookies = login_driver.get_cookies()
        # Persist the session so the scraping driver can reuse it.
        with open("cookies.txt", "w") as f:
            json.dump(cookies, f)
    finally:
        # fix: the original shadowed the global `driver` with a second
        # browser instance and never closed it (resource leak).
        login_driver.quit()


def singleData(para):
    """Search JD for ``para`` (an ISBN string), open the first hit and
    scrape the ISBN / publish-date / edition lines from the parameter list.

    Returns the shared ``data_dict`` on success; returns ``None`` if any
    step raised (the error is appended to ``exception.txt``).
    """
    try:
        driver.get('https://www.jd.com/')
        # Load the cookies captured by test01_login().
        with open("cookies.txt", "r") as f:
            cookies = json.load(f)
        for cookie in cookies:
            driver.add_cookie(cookie)
        driver.find_element_by_id("key").send_keys(para)
        driver.find_element_by_xpath('//*[@id="search"]/div/div[2]/button/i').click()
        sleep(3)  # crude wait for the result page to render
        h = etree.HTML(driver.page_source)
        # Follow only the first search result.
        driver.get("https:" + h.xpath('//*[@id="J_goodsList"]/ul/li[1]/div/div[1]/a/@href')[0])
        h = etree.HTML(driver.page_source)
        # The "parameter" list on the product page holds the book metadata.
        items = h.xpath('//div/ul[@class="parameter2 p-parameter-list"]/li/text()')
        for item in items:  # fix: original shadowed the builtin `list`
            text = item.lstrip()  # strip leading whitespace before matching prefixes
            if text.startswith('ISBN'):
                data_dict["ISBN"] = item
            if text.startswith('出版時間'):
                data_dict["出版時間"] = item
            if text.startswith('版次'):
                data_dict["版次"] = item
        # driver.close()
        return data_dict
    except Exception as e:
        # Best effort: log and return None; the caller guards against it.
        # (fix: removed redundant f.close() inside the `with` block)
        with open("exception.txt", "a", encoding="utf-8") as f:
            f.write(str(e) + "\n")


readbook = xlrd.open_workbook(r'table.xls')
SheetOfInput = readbook.sheet_by_name('Sheet1')
nrows = SheetOfInput.nrows  # number of rows to process

writebook = xlwt.Workbook(encoding="utf8")     # output workbook
SheetOfOutput = writebook.add_sheet('test')    # single output sheet

test01_login()
for gi in range(0, nrows):
    try:
        # NOTE(review): the ISBN is read from column index 4 (the original
        # comment claimed column 3) — confirm against the input table layout.
        isbn = SheetOfInput.cell(gi, 4).value
        tDict = singleData(isbn)
        if tDict is None:
            # fix: the original subscripted None here, raising a spurious
            # TypeError that the broad except below silently logged.
            continue
        SheetOfOutput.write(gi, 0, tDict["ISBN"])
        SheetOfOutput.write(gi, 1, tDict["出版時間"])
        SheetOfOutput.write(gi, 2, tDict["版次"])
        # Save after every row so progress survives a mid-run crash.
        writebook.save('answer.xls')
        print('tDict["ISBN"] = %s, tDict["出版時間"] = %s, tDict["版次"] = %s, gi = %d. '
              % (tDict["ISBN"], tDict["出版時間"], tDict["版次"], gi))
    except Exception as e:
        # Keep going on per-row failures; record the reason for later review.
        with open("exception.txt", "a", encoding="utf-8") as f:
            f.write(str(e) + "\n")
driver.quit()

#######################################
# Outline:
# - define a scraper that fetches one ISBN and returns a dict,
# - open the table and read the ISBN numbers,
# - call the scraper for each and write the returned dict to the output table.