1. 程式人生 > 程式設計 >selenium+PhantomJS爬取豆瓣讀書

selenium+PhantomJS爬取豆瓣讀書

本文例項為大家分享了selenium+PhantomJS爬取豆瓣讀書的具體程式碼,供大家參考,具體內容如下

獲取關於Python的全部書籍資訊;
經程式碼測試,以 request 攜帶「User-Agent」及「data」資料資訊的方式均無法獲取到相關資訊:獲取資料時部分資料為空,導致獲取過程中報錯,無法獲取全部資料,初步判定豆瓣讀書的反爬機制較為嚴格。改用 selenium 模擬瀏覽器請求的方法測試後發現,可以利用 selenium 請求並獲取資料。

#匯入需要的模組
from selenium import webdriver
import time
from lxml import etree
import pymysql
import re

#建立一個函式
def my_browers(url,page):
  """Load ``url`` in a PhantomJS browser and pass the rendered HTML to ``parse_html``.

  Args:
    url: Address of the search-result page to fetch.
    page: Result offset belonging to ``url``; kept for the caller's
      signature, it is not used inside this function.
  """
  # Start a PhantomJS browser from the hard-coded local executable.
  browers = webdriver.PhantomJS(executable_path=r'd:\Desktop\pythonjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
  try:
    # Request the page through the browser.
    browers.get(url)

    # Wait two seconds: a lower request rate makes the crawl slower but safer.
    time.sleep(2)

    # Grab the page source as the browser currently holds it.
    html = browers.page_source
  finally:
    # Always shut the browser down; without this every fetched page would
    # leave its own phantomjs process running.
    browers.quit()

  # Hand the HTML over to the parser.
  parse_html(html)
  
# 解析頁面資訊
def _first(nodes):
  """Return the first item of an XPath result list, or '' when it is empty."""
  return nodes[0] if nodes else ''


def parse_html(html):
  """Extract every book entry from a result page and store each via ``insert_mysql``.

  For each matched book node a dict with the keys ``pic``, ``book_name``,
  ``book_url``, ``score_book`` and ``book_detail`` is built, printed and
  passed to ``insert_mysql``. A field that is missing on the page is stored
  as an empty string.

  Args:
    html: Page source to parse.
  """
  # Build an element tree that can be queried with XPath.
  tree = etree.HTML(html)

  # One node per book in the result list.
  books = tree.xpath('//div[contains(@class,"sc-bZQynM")]')

  for book in books:
    # Every query below starts with './/' so that it only looks inside the
    # current book node. A bare '//' searches from the document root and
    # would yield the first book's values for every entry.
    pic = _first(book.xpath('.//img/@src'))

    # Quotes are removed and trailing backslashes dropped from the title
    # because they can break (or escape the closing quote of) a SQL
    # statement that is assembled by string interpolation.
    book_name = _first(book.xpath('.//div[@class="title"]/a/text()'))
    book_name = book_name.replace('"', '').replace("'", '').rstrip('\\')

    # Link to the book's detail page.
    book_url = _first(book.xpath('.//div[@class="title"]/a/@href'))

    # Rating shown for the book.
    score_book = _first(book.xpath('.//span[@class="rating_nums"]/text()'))

    # Publisher / abstract line; single quotes are removed for the same
    # SQL-safety reason as above.
    book_detail = _first(book.xpath('.//div[@class="meta abstract"]/text()'))
    book_detail = book_detail.replace("'", '')

    book_dict = {
      'pic': pic,
      'book_name': book_name,
      'book_url': book_url,
      'score_book': score_book,
      'book_detail': book_detail,
    }
    print(book_dict)

    # Persist the record.
    insert_mysql(book_dict)

# 插入資料庫
def insert_mysql(book_dict):
  """Insert one book record into the ``python_book`` table.

  Args:
    book_dict: Mapping with the keys ``pic``, ``book_name``, ``book_url``,
      ``score_book`` and ``book_detail``.
  """
  # Keyword arguments carry the same meaning as the former positional
  # host/user/password and are the form current PyMySQL releases accept.
  # NOTE(review): no database is named here, so the unqualified table name
  # relies on the server-side setup — confirm the intended schema.
  conn = pymysql.connect(host='localhost', user='root', password='test', charset='utf8')
  try:
    # Placeholders let the driver escape the values, so quotes or
    # backslashes in scraped text can neither break nor alter the statement.
    sql = ("insert into python_book (pic,book_name,book_url,score,book_detail) "
           "VALUES (%s,%s,%s,%s,%s)")
    params = (
      book_dict['pic'],
      book_dict['book_name'],
      book_dict['book_url'],
      book_dict['score_book'],
      book_dict['book_detail'],
    )

    # Execute and commit; the cursor is closed when the with-block exits.
    with conn.cursor() as cursor:
      cursor.execute(sql, params)
    conn.commit()
  finally:
    # Release the connection even if the insert fails; one connection is
    # opened per record, so leaking them would exhaust the server quickly.
    conn.close()


if __name__ == '__main__':
  # Visit result pages 1..199; the ``start`` offset in the URL grows by 15
  # with each page.
  for page_no in range(1, 200):
    print('=================下載第{}頁========================'.format(page_no))
    page = 15 * (page_no - 1)
    base_url = 'https://book.douban.com/subject_search?search_text=python&cat=1001&start={}'.format(page)
    my_browers(base_url, page)

以上就是本文的全部內容,希望對大家的學習有所幫助,也希望大家多多支援我們。