Qiushibaike (糗事百科) Example
阿新 · Published 2018-06-17
Crawl the Qiushibaike jokes at http://www.qiushibaike.com/8hr/page/

- Use requests to fetch each page and XPath to extract the data.
- From every post, pull the user's avatar link, username, joke content, upvote count, and comment count (the XPath expressions involved are illustrated in the short sketch after this list).
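To make those XPath expressions concrete, here is a minimal, self-contained sketch that runs them against a made-up HTML fragment shaped like a Qiushibaike post. The fragment and its values are invented for illustration only; the selectors are the same ones used in the full script below.

# -*- coding:utf-8 -*-
from lxml import etree

# Invented fragment that mimics the structure the XPath expressions expect
sample = '''
<div id="qiushi_tag_12345">
  <div>
    <a href="/users/1"><img src="//pic.example.com/avatar.jpg" alt="someuser"/></a>
    <h2>someuser</h2>
  </div>
  <div class="content"><span>A short joke goes here.</span></div>
  <i class="number">123</i>
  <i class="number">45</i>
</div>
'''

html = etree.HTML(sample)
for site in html.xpath('//div[contains(@id, "qiushi_tag")]'):
    print(site.xpath('./div/a/img/@src')[0])                    # avatar link
    print(site.xpath('.//h2')[0].text)                          # username
    print(site.xpath('.//div[@class="content"]/span')[0].text)  # joke content
    print(site.xpath('.//i')[0].text)                           # upvotes
    print(site.xpath('.//i')[1].text)                           # comments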
# -*- coding:utf-8 -*-
import requests
from lxml import etree

def loadPage(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8'}
    try:
        response = requests.get(url, headers=headers)
        resHtml = response.text
        html = etree.HTML(resHtml)
        # Every post lives in a div whose id contains "qiushi_tag"
        result = html.xpath('//div[contains(@id,"qiushi_tag")]')
        for site in result:
            # Avatar link
            imgUrl = site.xpath('./div/a/img/@src')[0].encode('utf-8')
            # Username (also available as .//img/@alt)
            username = site.xpath('.//h2')[0].text
            # Joke content
            content = site.xpath('.//div[@class="content"]/span')[0].text.strip().encode('utf-8')
            # Vote count
            vote = site.xpath('.//i')[0].text
            # Comment count
            comments = site.xpath('.//i')[1].text
            print imgUrl, username, content, vote, comments
    except Exception as e:
        print e


def qiushiSpider(url, beginPage, endPage):
    """
    Spider scheduler: builds the URL of each page and hands it to loadPage.
    url       : the fixed front part of the page URL
    beginPage : first page to crawl
    endPage   : last page to crawl
    """
    for page in range(beginPage, endPage + 1):
        fullurl = url + str(page)
        loadPage(fullurl)


if __name__ == "__main__":
    beginPage = int(raw_input("Enter the start page: "))
    endPage = int(raw_input("Enter the end page: "))
    url = 'http://www.qiushibaike.com/8hr/page/'
    qiushiSpider(url, beginPage, endPage)
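The script above targets Python 2 (print statements, raw_input, str.encode calls). If it has to run under Python 3, a rough, untested adaptation of the same fetch-and-parse flow might look like the sketch below; the encode('utf-8') calls go away because Python 3 strings are already Unicode, and raw_input becomes input.

# -*- coding:utf-8 -*-
# Hedged Python 3 sketch of the same requests + XPath flow (not the original script)
import requests
from lxml import etree

def load_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8'}
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    for site in html.xpath('//div[contains(@id, "qiushi_tag")]'):
        img_url = site.xpath('./div/a/img/@src')[0]
        username = site.xpath('.//h2')[0].text
        content = site.xpath('.//div[@class="content"]/span')[0].text.strip()
        vote = site.xpath('.//i')[0].text
        comments = site.xpath('.//i')[1].text
        print(img_url, username, content, vote, comments)

if __name__ == "__main__":
    begin_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    for page in range(begin_page, end_page + 1):
        load_page('http://www.qiushibaike.com/8hr/page/' + str(page))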
Save the results to a JSON file
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import urllib2
import json
from lxml import etree


def loadPage(url):
    headers = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}

    request = urllib2.Request(url, headers=headers)
    html = urllib2.urlopen(request).read()
    # The response is a raw string; parse it into an HTML DOM
    text = etree.HTML(html)
    # contains() does a substring match: the first argument is the attribute to
    # check, the second is the substring to look for; this returns every post node
    node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')

    for node in node_list:
        # xpath returns a list; take the single element by index -- username
        username = node.xpath('.//img/@alt')[0]
        # Image link (may be missing if the post has no picture)
        image = node.xpath('.//div[@class="thumb"]//@src')
        image = image[0] if image else ""
        # Text inside the tag -- joke content
        content = node.xpath('.//div[@class="content"]/span')[0].text
        # Vote count
        zan = node.xpath('.//i')[0].text
        # Comment count
        comments = node.xpath('.//i')[1].text

        items = {
            "username" : username,
            "image" : image,
            "content" : content,
            "zan" : zan,
            "comments" : comments
        }

        # Append one JSON object per line, inside the loop so every post is saved
        with open("qiushi.json", "a") as f:
            f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + "\n")


def qiushiSpider(url, beginPage, endPage):
    """
    Spider scheduler: builds the URL of each page and hands it to loadPage.
    url       : the fixed front part of the page URL
    beginPage : first page to crawl
    endPage   : last page to crawl
    """
    for page in range(beginPage, endPage + 1):
        fullurl = url + str(page)
        loadPage(fullurl)


if __name__ == "__main__":
    beginPage = int(raw_input("Enter the start page: "))
    endPage = int(raw_input("Enter the end page: "))
    url = 'http://www.qiushibaike.com/8hr/page/'
    qiushiSpider(url, beginPage, endPage)
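Because every post is appended as its own json.dumps(...) line, qiushi.json ends up holding one JSON object per line rather than a single JSON array. Assuming the file was produced by the script above, a small sketch for reading it back could look like this:

# -*- coding:utf-8 -*-
import io
import json

# qiushi.json holds one UTF-8 encoded JSON object per line
with io.open("qiushi.json", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        item = json.loads(line)
        print("%s  zan=%s  comments=%s" % (item["username"], item["zan"], item["comments"]))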