爬蟲之爬取豆瓣圖書排行榜
阿新 • • 發佈:2019-02-03
# Scrape the Douban "Books Top 250" chart with requests + lxml XPath and
# append one line per book to book_info.txt.
from bs4 import BeautifulSoup
from lxml import etree
import requests
import time
import os

# NOTE: an earlier variant of this script parsed the page with BeautifulSoup's
# soup.select(); the XPath implementation below replaced it.  The bs4 import
# is retained from that variant.

TOP250_URL = 'https://book.douban.com/top250?start={}'
PAGE_SIZE = 25            # books listed per chart page
TOTAL_BOOKS = 250         # size of the whole chart
REQUEST_TIMEOUT = 10      # seconds before a stalled request is abandoned
REQUEST_DELAY = 2         # seconds to pause after each page request
OUTPUT_PATH = 'book_info.txt'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
}


def _fetch_page(start):
    """Download one chart page and return its HTML text.

    Args:
        start: Zero-based offset of the first book on the requested page.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (so a blocked request is not silently parsed
            as an empty page).
    """
    response = requests.get(
        url=TOP250_URL.format(start),
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.text


def _parse_book(table):
    """Extract the fields of a single book from its ``<table>`` element.

    Args:
        table: lxml element for one book entry.

    Returns:
        A tuple ``(title, score, rating_count, publish, quote)``.  ``quote``
        is ``None`` when the entry has no quote; the subtitle is appended to
        the title only when one is present.

    Raises:
        IndexError: If a required field (main title, score, rating count or
            publishing info) is missing from the entry.
    """
    main_title = table.xpath('./ tr / td[2] / div[1] / a / @title')[0]
    # Subtitle is optional: check the result list instead of catching
    # every exception.
    subtitle = table.xpath('./ tr / td[2] / div[1] / a / span/text()')
    title = (main_title + subtitle[0]) if subtitle else main_title

    score = table.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
    # The rating count is wrapped in parentheses and whitespace on the page.
    rating_count = (
        table.xpath('./tr/td[2]/div[2]/span[3]/text()')[0]
        .strip('(')
        .strip(')')
        .strip()
    )

    # The quote is optional as well.
    quotes = table.xpath('./tr / td[2] / p[2] / span/text()')
    quote = quotes[0] if quotes else None

    publish = table.xpath('./ tr / td[2] / p[1]/text()')[0]
    return title, score, rating_count, publish, quote


def main():
    """Walk every chart page and append one formatted line per book to the output file."""
    rank = 0
    # Open the output once instead of re-opening it for every single book.
    with open(OUTPUT_PATH, 'a', encoding='utf-8') as out:
        for start in range(0, TOTAL_BOOKS, PAGE_SIZE):
            tree = etree.HTML(_fetch_page(start))
            tables = tree.xpath('// *[ @ id = "content"] / div / div[1] / div / table')
            # Pause after each request to avoid hammering the site.
            time.sleep(REQUEST_DELAY)
            for table in tables:
                rank += 1
                title, score, rating_count, publish, quote = _parse_book(table)
                out.write("{}: {} / {}/ {} / {} / {} \n".format(
                    rank, title, score, rating_count, publish, quote))
    print('finish saving!')


if __name__ == '__main__':
    main()