程式人生 > 其它 > 使用selenium用ISBN在京東上批量爬取書籍資訊

使用selenium用ISBN在京東上批量爬取書籍資訊

首先讀取 .xls 檔案,然後根據表格裡的ISBN在京東上挨個搜尋,再把需要的資訊從網頁上提取出來儲存在另一個檔案裡。

每次執行 .py 檔案後開啟瀏覽器會彈出登入頁面(30s),在此期間手動登入,30秒後開始爬取。

#!/usr/bin/python
# -*- coding: UTF-8 -*-

# Standard library
import json
import datetime
from time import sleep

# Third-party
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys  # fix: statement was broken across a line break
from lxml import etree
import xlrd
import xlwt

# Headless-browser setup kept for reference; uncomment to run without a UI.
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-gpu')
# options.add_argument('--disable-dev-shm-usage')
# driver = webdriver.Chrome(chrome_options=options)
# Scraped fields for one book. Pre-filled with placeholders; a field keeps
# its previous value when it cannot be found on a product page (the dict is
# shared/mutated across calls to singleData — original design, kept as-is).
data_dict = tDict = {'ISBN': '0000000000000', '出版時間': '0000-00-00', '版次': '1'}

# Module-level browser used for the actual scraping.
driver = webdriver.Chrome()


def test01_login():
    """Open the JD login page, wait 30 s for a manual login, dump cookies.

    A separate, temporary Chrome instance is used for the login; the cookies
    it saves to ``cookies.txt`` are later loaded into the module-level
    ``driver`` by :func:`singleData`.
    """
    login_driver = webdriver.Chrome()
    try:
        login_driver.get(
            "https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Fwww.jd.com%2F")
        sleep(30)  # log in by hand during this window
        cookies = login_driver.get_cookies()
        # Persist the session so the scraping driver can reuse it.
        with open("cookies.txt", "w") as f:
            json.dump(cookies, f)
    finally:
        # fix: the original shadowed the global `driver` with a second
        # browser instance and never closed it (resource leak).
        login_driver.quit()


def singleData(para):
    """Search JD for ``para`` (an ISBN string), open the first hit and
    scrape the ISBN / publish-date / edition lines from the parameter list.

    Returns the shared ``data_dict`` on success; returns ``None`` if any
    step raised (the error is appended to ``exception.txt``).
    """
    try:
        driver.get('https://www.jd.com/')
        # Load the cookies captured by test01_login().
        with open("cookies.txt", "r") as f:
            cookies = json.load(f)
        for cookie in cookies:
            driver.add_cookie(cookie)
        driver.find_element_by_id("key").send_keys(para)
        driver.find_element_by_xpath('//*[@id="search"]/div/div[2]/button/i').click()
        sleep(3)  # crude wait for the result page to render
        h = etree.HTML(driver.page_source)
        # Follow only the first search result.
        driver.get("https:" + h.xpath('//*[@id="J_goodsList"]/ul/li[1]/div/div[1]/a/@href')[0])
        h = etree.HTML(driver.page_source)
        # The "parameter" list on the product page holds the book metadata.
        items = h.xpath('//div/ul[@class="parameter2 p-parameter-list"]/li/text()')
        for item in items:  # fix: original shadowed the builtin `list`
            text = item.lstrip()  # strip leading whitespace before matching prefixes
            if text.startswith('ISBN'):
                data_dict["ISBN"] = item
            if text.startswith('出版時間'):
                data_dict["出版時間"] = item
            if text.startswith('版次'):
                data_dict["版次"] = item
        # driver.close()
        return data_dict
    except Exception as e:
        # Best effort: log and return None; the caller guards against it.
        # (fix: removed redundant f.close() inside the `with` block)
        with open("exception.txt", "a", encoding="utf-8") as f:
            f.write(str(e) + "\n")


readbook = xlrd.open_workbook(r'table.xls')
SheetOfInput = readbook.sheet_by_name('Sheet1')
nrows = SheetOfInput.nrows  # number of rows to process

writebook = xlwt.Workbook(encoding="utf8")     # output workbook
SheetOfOutput = writebook.add_sheet('test')    # single output sheet

test01_login()
for gi in range(0, nrows):
    try:
        # NOTE(review): the ISBN is read from column index 4 (the original
        # comment claimed column 3) — confirm against the input table layout.
        isbn = SheetOfInput.cell(gi, 4).value
        tDict = singleData(isbn)
        if tDict is None:
            # fix: the original subscripted None here, raising a spurious
            # TypeError that the broad except below silently logged.
            continue
        SheetOfOutput.write(gi, 0, tDict["ISBN"])
        SheetOfOutput.write(gi, 1, tDict["出版時間"])
        SheetOfOutput.write(gi, 2, tDict["版次"])
        # Save after every row so progress survives a mid-run crash.
        writebook.save('answer.xls')
        print('tDict["ISBN"] = %s, tDict["出版時間"] = %s, tDict["版次"] = %s, gi = %d. '
              % (tDict["ISBN"], tDict["出版時間"], tDict["版次"], gi))
    except Exception as e:
        # Keep going on per-row failures; record the reason for later review.
        with open("exception.txt", "a", encoding="utf-8") as f:
            f.write(str(e) + "\n")
driver.quit()

#######################################
# Outline:
# - define a scraper that fetches one ISBN and returns a dict,
# - open the table and read the ISBN numbers,
# - call the scraper for each and write the returned dict to the output table.