1. 程式人生 > >爬蟲之爬取豆瓣圖書排行榜

爬蟲之爬取豆瓣圖書排行榜

from bs4 import BeautifulSoup
from lxml import etree
import requests
import time
import os
# Browser-like User-Agent sent with every page request.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19',
}
# Ranking page URL; ``start`` is the zero-based offset of the first book.
_PAGE_URL = 'https://book.douban.com/top250?start={}'
_OUTPUT_FILE = 'book_info.txt'
_REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging the run
_PAGE_DELAY = 2  # seconds to pause after each page request

# NOTE: the same fields can alternatively be pulled out with BeautifulSoup CSS
# selectors ('div.pl2 a', 'span.inq', 'td p.pl', 'span.rating_nums'); this
# script uses lxml XPath queries instead.


def _fetch_book_tables(start):
    """Download one ranking page and return its per-book ``<table>`` elements.

    Args:
        start: Offset passed as the ``start`` query parameter of the page URL.

    Returns:
        The list of elements matched by the book-table XPath (may be empty).

    Raises:
        requests.HTTPError: If the server answers with an error status, so a
            blocked or failed request is not silently parsed as an empty page.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(
        url=_PAGE_URL.format(start),
        headers=_HEADERS,
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    page = etree.HTML(response.text)
    return page.xpath('// *[ @ id = "content"] / div / div[1] / div / table')


def _parse_book(table):
    """Extract the fields of a single book from its ``<table>`` element.

    Args:
        table: Element supporting ``.xpath()``, one entry of the ranking list.

    Returns:
        A ``(title, score, evale, publish, word)`` tuple. ``word`` (the quote
        line) is ``None`` when the book has none; the subtitle span, when
        present, is appended to the main title.

    Raises:
        IndexError: If a mandatory field (title, score, rating count,
            publishing info) is not found in the table.
    """
    title = table.xpath('./ tr / td[2] / div[1] / a / @title')[0]
    # The subtitle span is optional: only append it when the query matched.
    subtitle = table.xpath('./ tr / td[2] / div[1] / a / span/text()')
    if subtitle:
        title += subtitle[0]

    score = table.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
    # Drop the surrounding parentheses and whitespace around the rating count.
    evale = table.xpath('./tr/td[2]/div[2]/span[3]/text()')[0].strip('(').strip(')').strip()

    # The quote line is optional as well.
    quote = table.xpath('./tr / td[2] / p[2] / span/text()')
    word = quote[0] if quote else None

    publish = table.xpath('./ tr / td[2] / p[1]/text()')[0]
    return title, score, evale, publish, word


def main():
    """Scrape the Douban book Top 250 with XPath and append it to a text file.

    Walks the ten ranking pages (25 books each), numbers the books in the
    order they are encountered and appends one line per book to
    ``book_info.txt`` (UTF-8).
    """
    rank = 0
    # Open the output once for the whole run instead of once per book.
    with open(_OUTPUT_FILE, 'a', encoding='utf-8') as out:
        for start in range(0, 250, 25):
            tables = _fetch_book_tables(start)
            # Throttle: wait before processing so page requests are spaced out.
            time.sleep(_PAGE_DELAY)
            for table in tables:
                rank += 1
                title, score, evale, publish, word = _parse_book(table)
                out.write("{}: {} / {}/ {} / {} / {} \n".format(rank, title, score, evale, publish, word))
    print('finish saving!')


if __name__ == '__main__':
    main()