1. 程式人生 > 糗事百科正則爬蟲

糗事百科正則爬蟲

.html == resp 加載 初始 main findall print 錯誤

參考博客:http://cuiqingcai.com/990.html

# -*- coding:utf-8 -*- 
import urllib
import urllib2
import re

# Fetch one listing page from qiushibaike.com and print every post found on
# it as "author text votes".
page = 1

url = "https://www.qiushibaike.com/8hr/page/" + str(page)
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    content = response.read()
    # Alternative pattern that additionally captures the .jpg URL of posts
    # carrying an image:
    # pattern = re.compile(r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<img src="(.*?\.jpg)" .*?stats-vote.*?number">(\d+)', re.S)
    # Pattern used for posts without an image: captures author, text and
    # vote count. re.S (DOTALL) lets "." match newlines, so a single post
    # may span several lines of HTML.
    pattern = re.compile(
        r'<div class="author clearfix">.*?<h2>(.*?)</h2>'
        r'.*?<span>(.*?)</span>'
        r'.*?stats-vote.*?number">(\d+)',
        re.S)
    items = re.findall(pattern, content)
    for item in items:
        # Joined with single spaces, matching a comma-separated print.
        print(" ".join([item[0], item[1].strip(), item[2]]))
except urllib2.URLError as e:
    # Report whichever details the error object actually carries.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)

與用戶交互

# -*- coding:utf-8 -*-

import urllib, urllib2
import re
import thread
import time 
# Shared queue of formatted stories waiting to be shown. It is filled by
# Qsbk.get_page_items and drained from the front by Qsbk.get_one_story.
stories = []


class Qsbk(object):
    """Fetch listing pages from qiushibaike.com and queue their stories."""

    # Captures, for each post: author name, story text, vote count.
    # re.S (DOTALL) lets "." match newlines, so one post may span several
    # lines of HTML. Compiled once here instead of on every page.
    STORY_PATTERN = re.compile(
        r'<div class="author clearfix">.*?<h2>(.*?)</h2>'
        r'.*?<span>(.*?)</span>'
        r'.*?stats-vote.*?number">(\d+)',
        re.S)

    def __init__(self):
        """Set the base listing URL and the request headers."""
        self.url = "https://www.qiushibaike.com/8hr/page/"
        self.headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}

    def get_page(self, page):
        """Download listing page ``page`` and queue the stories found on it.

        A ``urllib2.URLError`` is not propagated: its ``code`` and/or
        ``reason`` (whichever exist) are printed and nothing is queued.
        """
        fullurl = self.url + str(page)
        try:
            request = urllib2.Request(url=fullurl, headers=self.headers)
            response = urllib2.urlopen(request).read()
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            return
        self.get_page_items(response)

    def get_page_items(self, response):
        """Parse the page HTML in ``response`` and append its stories.

        Each match becomes one string of the form
        ``author + "\\n" + votes + "\\n" + text`` (with ``<br>`` / ``<br/>``
        tags removed from the text) appended to the module-level ``stories``.
        """
        for author, text, votes in self.STORY_PATTERN.findall(response):
            body = text.strip().replace("<br>", "").replace("<br/>", "")
            stories.append(author.strip() + "\n" + votes.strip() + "\n" + body)

    def load_page(self, page):
        """Load listing page ``page``; a thin wrapper around ``get_page``.

        Deciding *when* to load (e.g. when the queue is running low) is left
        to the caller.
        """
        self.get_page(page)

    def get_one_story(self):
        """Print the oldest queued story between two separator lines.

        When the queue is empty (nothing fetched or nothing matched) a short
        notice is printed instead of raising ``IndexError``.
        """
        if not stories:
            print("暫時沒有可顯示的段子")
            return
        print("--------------------------------------------------------------------------------------")
        print(stories.pop(0))
        print("--------------------------------------------------------------------------------------\n")
    
def main():
    """Run the interactive reader loop.

    Loads a first page, then shows one queued story each time the user
    presses Enter and stops when the user types ``q``. Whenever fewer than
    10 stories remain queued, the next page is loaded before showing one.
    """
    print("段子加載中...")
    qsbk = Qsbk()
    # NOTE(review): the first request is for page 0 — confirm the site
    # serves that index.
    page = 0
    qsbk.load_page(page)
    while True:
        option = raw_input("按任意鍵看段,按q退出:")
        if option == "q":
            break
        # Top up the queue before it runs dry.
        if len(stories) < 10:
            page += 1
            qsbk.load_page(page)
        if not stories:
            # The fetch failed or the page held no parsable stories; keep
            # prompting instead of popping from an empty queue.
            print("暫時沒有可顯示的段子,請稍後再試")
            continue
        qsbk.get_one_story()


# Start the reader only when this file is executed as a script.
if __name__ == "__main__":
    main()
     

糗事百科正則爬蟲