程式人生 > 爬取豆瓣古典文學(數據庫存儲)

爬取豆瓣古典文學(數據庫存儲)

標籤: cursor, text, code, lxml, sqlite

代碼如下:

# coding:utf-8
import codecs
import random
import re
import sqlite3
import sys
import time

# cPickle was merged into pickle in Python 3; alias so both work.
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3

import requests
from lxml import etree
 12 class Spider:
 13     def __init__(self):
 14         self.con = sqlite3.connect(rBookInformation.db
) 15 self.cur = self.con.cursor() 16 self.home = https://book.douban.com/tag/%E5%8F%A4%E5%85%B8%E6%96%87%E5%AD%A6 17 self.Referer = https://book.douban.com/ 18 self.user_agent_list = [] 19 self.books_list = [] 20 with open(user_agent.txt, rb) as f:
21 self.user_agent_list = cPickle.load(f) 22 23 def GetHeaders(self): 24 UserAgent = random.choice(self.user_agent_list) 25 headers = {Referer: self.Referer, User-Agent: UserAgent} 26 return headers 27 28 def SaveBook(self,info): 29 sql =
INSERT INTO BookInfo VALUES(?,?,?,?,?) 30 info_list = (info["Name"],info["Author"],info["Rating"],info["ContentIntro"],info["AuthorIntro"]) 31 self.cur.execute(sql, info_list) 32 self.con.commit() 33 34 def Crawl(self): 35 html = requests.get(self.home,headers=self.GetHeaders()).text 36 html_tree = etree.HTML(html) 37 booksList = html_tree.xpath(/html/body/div[3]/div[1]/div/div[1]/div/ul/li) 38 num = 0 39 for book in booksList: 40 time.sleep(1) 41 bookUrl = book.xpath(div[2]/h2/a)[0].get(href) 42 pageHtml = requests.get(bookUrl,headers=self.GetHeaders()).text 43 page_tree = etree.HTML(pageHtml) 44 book_info = self.GetPage(page_tree) 45 print book_info[Name] 46 self.SaveBook(book_info) 47 # self.books_list.append(book_info) 48 # f = codecs.open(‘text.txt‘,‘a‘,encoding=‘utf-8‘) 49 # f.write(book_info[‘AuthorIntro‘]) 50 # f.close() 51 # print book_info[‘AuthorIntro‘] 52 num = num+1 53 if num==5: 54 break 55 56 57 def GetPage(self, page_tree): 58 book_info = {} 59 try: 60 Name = self.GetName(page_tree) 61 book_info[Name] = Name 62 except: 63 book_info[Name] = ‘‘ 64 try: 65 Author = self.GetAuthor(page_tree) 66 book_info[Author] = Author 67 except: 68 book_info[Author] = ‘‘ 69 try: 70 Rating = self.GetRating(page_tree) 71 book_info[Rating] = Rating 72 except: 73 book_info[Rating] = ‘‘ 74 try: 75 ContentIntro = self.GetContentIntro(page_tree) 76 book_info[ContentIntro] = ContentIntro 77 except: 78 book_info[ContentIntro] = ‘‘ 79 try: 80 AuthorIntro = self.GetAuthorIntro(page_tree) 81 book_info[AuthorIntro] = AuthorIntro 82 except: 83 book_info[AuthorIntro] = ‘‘ 84 85 86 return book_info 87 88 def GetName(self, page_tree): 89 return page_tree.xpath(/html/body/div[3]/h1/span)[0].text 90 91 def GetAuthor(self,page_tree): 92 author_list = page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/span[1]/a) 93 result = ‘‘ 94 if len(author_list) is not 0: 95 list = [] 96 for author in author_list: 97 list.append(author.text.strip()) 98 result = 
/.join(list) 99 else: 100 result = page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/a)[0].text.strip() 101 return re.sub(r\s+, ,result) 102 103 104 def GetRating(self, page_tree): 105 return page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[2]/div/div[2]/strong)[0].text.strip() 106 107 def GetContentIntro(self, page_tree): 108 para_div = page_tree.xpath(//*[@id="link-report"]//div[@class="intro"]) 109 result = ‘‘ 110 if len(para_div) is not 0: 111 para_para = para_div[len(para_div)-1].xpath(p) 112 for para in para_para: 113 result = result+\t+para.text+\n 114 return result 115 116 def GetAuthorIntro(self, page_tree): 117 para_div = page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[3]/div[@class="indent "]//div[@class="intro"]) 118 result = ‘‘ 119 if len(para_div) is not 0: 120 para_para = para_div[len(para_div) - 1].xpath(p) 121 for para in para_para: 122 result = result + \t + para.text + \n 123 return result 124 125 # def GetCatalogue(self, page_tree): 126 # pass 127 # 128 # def GetTag(self, page_tree): 129 # pass 130 # 131 # def GetShortCommentary(self, page_tree): 132 # pass 133 134 if __name__ == __main__: 135 s = Spider() 136 s.Crawl()

爬取豆瓣古典文學(數據庫存儲)