糗事百科正則爬蟲
阿新 • • 發佈:2017-10-22
.html == resp 加載 初始 main findall print 錯誤
參考博客:http://cuiqingcai.com/990.html
# -*- coding:utf-8 -*-
# Fetch one page of the Qiushibaike "8hr" listing and print, for every post
# the regex matches, the author name, the post text and the vote count.
# Written to run unchanged on Python 2 and Python 3.
import re
import urllib
from contextlib import closing

try:  # Python 2
    import urllib2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 keeps these names in urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen

page = 1
url = "https://www.qiushibaike.com/8hr/page/" + str(page)
# The request is sent with a browser-like User-Agent header.
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}

# Alternative pattern for posts that contain a .jpg image (adds the image URL
# as an extra capture group):
# pattern = re.compile(r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<img src="(.*?\.jpg)" .*?stats-vote.*?number">(\d+)', re.S)

# Pattern for posts without an image.
# Capture groups: (1) author name, (2) post text, (3) vote count.
# re.S makes "." match newlines so a single match can span several HTML lines.
pattern = re.compile(r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?.*?stats-vote.*?number">(\d+)', re.S)

try:
    request = Request(url, headers=headers)
    # closing() guarantees the HTTP response is released after reading.
    with closing(urlopen(request)) as response:
        content = response.read()
except URLError as e:
    # Report whichever diagnostic attributes the error carries.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
else:
    # Python 3 returns bytes from read(); the pattern is a text pattern, so
    # decode first. On Python 2 read() already returns str and is left as is.
    if not isinstance(content, str):
        content = content.decode("utf-8", "replace")
    items = pattern.findall(content)
    for item in items:
        print("%s %s %s" % (item[0], item[1].strip(), item[2]))
與用戶交互
# -*- coding:utf-8 -*-
# Interactive Qiushibaike reader: downloads listing pages, queues the posts
# found on them in the module-level ``stories`` list, and prints one post each
# time the user presses Enter. Written to run on Python 2 and Python 3.
import re
import time
import urllib
from contextlib import closing

try:  # Python 2
    import thread
    import urllib2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 renamed/moved these modules
    import _thread as thread
    from urllib.error import URLError
    from urllib.request import Request, urlopen

try:  # Python 2: raw_input returns the typed line without evaluating it
    _read_line = raw_input
except NameError:  # Python 3 calls the same function input
    _read_line = input

# Queue of formatted posts waiting to be shown, oldest first.
stories = []

# Capture groups: (1) author name, (2) post text, (3) vote count.
# re.S makes "." match newlines so a single match can span several HTML lines.
# Compiled once here instead of on every get_page_items() call.
_STORY_PATTERN = re.compile(r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?.*?stats-vote.*?number">(\d+)', re.S)

_SEPARATOR = "--------------------------------------------------------------------------------------"


class Qsbk(object):
    """Download Qiushibaike listing pages and queue their posts in ``stories``."""

    def __init__(self):
        """Store the listing base URL and the headers sent with each request."""
        self.url = "https://www.qiushibaike.com/8hr/page/"
        self.headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}

    def get_page(self, page):
        """Download listing page ``page`` and queue every post found on it.

        A ``URLError`` is not propagated: its ``code`` and/or ``reason``
        (whichever attributes exist) are printed and nothing is queued.
        """
        fullurl = self.url + str(page)
        try:
            request = Request(url=fullurl, headers=self.headers)
            # closing() guarantees the HTTP response is released after reading.
            with closing(urlopen(request)) as handle:
                response = handle.read()
        except URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            return
        # Python 3 returns bytes from read(); the pattern is a text pattern,
        # so decode first. On Python 2 read() already returns str.
        if not isinstance(response, str):
            response = response.decode("utf-8", "replace")
        self.get_page_items(response)

    def get_page_items(self, response):
        """Parse page HTML ``response`` and append one entry per post to ``stories``.

        Each entry is three lines: author name, vote count, then the post
        text with ``<br>`` and ``<br/>`` tags removed.
        """
        global stories
        for author, text, votes in _STORY_PATTERN.findall(response):
            body = text.strip().replace("<br>", "").replace("<br/>", "")
            stories.append(author.strip() + "\n" + votes.strip() + "\n" + body)

    def load_page(self, page):
        """Load listing page ``page`` into the queue (delegates to ``get_page``)."""
        self.get_page(page)

    def get_one_story(self):
        """Print the oldest queued post between two separator lines.

        When the queue is empty (for example after a failed download) a short
        notice is printed instead of raising ``IndexError``.
        """
        global stories
        if not stories:
            print("暫無段子可顯示")
            return
        print(_SEPARATOR)
        print(stories.pop(0))
        print(_SEPARATOR + "\n")


def main():
    """Run the interactive loop: Enter shows the next post, ``q`` quits."""
    print("段子加載中...")
    qsbk = Qsbk()
    # NOTE(review): the first page requested is 0 and the next one is 1 —
    # confirm the site serves /page/0 as a distinct page.
    page = 0
    qsbk.load_page(page)
    while True:
        option = _read_line("按任意鍵看段,按q退出:")
        if option == "q":
            break
        # Top the queue up before it runs dry.
        if len(stories) < 10:
            page += 1
            qsbk.load_page(page)
        qsbk.get_one_story()


if __name__ == "__main__":
    main()
糗事百科正則爬蟲