1. Spider: scraping Baidu Tieba

Flexible crawling from a user-specified start page

# -*- coding: utf-8 -*-
"""
import urllib2

url = "http://www.baidu.com"
# The User-Agent of IE 9.0, carried in ua_header
ua_header = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
# Build a Request from the url together with the headers; the request
# will carry the IE 9.0 browser's User-Agent
request = urllib2.Request(url, headers = ua_header)
# Send the request to the server
response = urllib2.urlopen(request)

html = response.read()
print html



import urllib      # handles url encoding
import urllib2

url = "http://www.baidu.com/s"
word = {"wd": "華育興業"}
word = urllib.urlencode(word)  # convert to url-encoded format (a string)
newurl = url + "?" + word      # "?" is the first separator in a url

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}

request = urllib2.Request(newurl, headers=headers)

response = urllib2.urlopen(request)
print response.read()
"""



# Tieba spider example
import sys
import urllib
import urllib2

# Python 2 hack: force the default string encoding to utf-8
if sys.getdefaultencoding() != 'utf-8':
    reload(sys)
    sys.setdefaultencoding('utf-8')

def writeFile(html, filename):
    """
        Purpose: save the server's response to a file on the local disk
        html: the server's response
        filename: name of the local file
    """
    with open("d:/123/" + filename, 'w') as f:
        f.write(html)
    print "-" * 20


def tiebaSpider(url, beginPage, endPage):
    """
        Purpose: build the url for each page and dispatch a request for it
        url: the base url to process
        beginPage: first page to crawl
        endPage: last page to crawl
    """
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50

        filename = "page_" + str(page) + ".html"
        # build the full url; pn grows by 50 per page
        fullurl = url + "&pn=" + str(pn)

        # call loadPage() to send the request and fetch the HTML page
        html = loadPage(fullurl, filename)
        # write the fetched HTML page to a local file
        writeFile(html, filename)

def loadPage(url, filename):
    '''
        Purpose: send a request for the url and return the server's response
        url: the url to crawl
        filename: name of the output file (printed for progress)
    '''
    print "Downloading " + filename

    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    return response.read()


# simulate a main function
if __name__ == "__main__":
    kw = raw_input("Enter the Tieba forum to crawl: ")
    # read the start and end pages, converting str to int
    beginPage = int(raw_input("Enter the start page: "))
    endPage = int(raw_input("Enter the end page: "))

    url = "http://tieba.baidu.com/f?"
    key = urllib.urlencode({"kw": kw})

    # example of the combined url: http://tieba.baidu.com/f?kw=lol
    url = url + key
    tiebaSpider(url, beginPage, endPage)
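
Tieba shows 50 posts per page, which is why the spider computes pn = (page - 1) * 50. A quick self-contained check of the urls it will request, assuming the user typed "lol" with start page 1 and end page 3:

import urllib

kw_query = urllib.urlencode({"kw": "lol"})
for page in range(1, 4):
    print "http://tieba.baidu.com/f?" + kw_query + "&pn=" + str((page - 1) * 50)
# http://tieba.baidu.com/f?kw=lol&pn=0
# http://tieba.baidu.com/f?kw=lol&pn=50
# http://tieba.baidu.com/f?kw=lol&pn=100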

2. Downloading images with lxml and XPath

# -*- coding: utf-8 -*-
import sys
import urllib
import urllib2
from lxml import etree


# Python 2 hack: force the default string encoding to utf-8
if sys.getdefaultencoding() != 'utf-8':
    reload(sys)
    sys.setdefaultencoding('utf-8')

class Spider:
    def __init__(self):
        self.beginPage = int(raw_input("Enter the start page: "))
        self.endPage = int(raw_input("Enter the end page: "))

        self.url = 'http://duanziwang.com/category/%E6%90%9E%E7%AC%91%E5%9B%BE/'
        self.ua_header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

    def tiebaSpider(self):
        for page in range(self.beginPage, self.endPage + 1):
            myUrl = self.url + str(page) + '/'
            self.loadImages(myUrl)

    def loadImages(self, link):
        req = urllib2.Request(link, headers=self.ua_header)
        html = urllib2.urlopen(req).read()
        selector = etree.HTML(html)
        imagesLinks = selector.xpath('//div/p/img/@src')
        imagesNames = selector.xpath('//div/p/img/@title')
        # take each image url and title in turn, download and save
        for (imageslink, imagesname) in zip(imagesLinks, imagesNames):
            self.writeImages(imageslink, imagesname)

    def writeImages(self, imagesLink, imagesName):
        '''
            Save the image's binary content into a file named after imagesName
        '''
        print "Saving file %s ..." % imagesName
        # 1. open the file, which returns a file object
        with open("d:/124/" + imagesName, "wb") as f:
            # 2. fetch the image's binary content
            image = urllib2.urlopen(imagesLink).read()
            # 3. call the file object's write() method to write the content to the file
            f.write(image)


# simulate a main function
if __name__ == "__main__":

    # first create the spider object
    mySpider = Spider()
    # call the spider's method to start the work
    mySpider.tiebaSpider()
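
The two xpath calls are the heart of loadImages: a path ending in @src returns a plain list of attribute strings rather than element objects, so the two result lists can be zipped and passed straight to writeImages. A minimal sketch on a made-up fragment (illustrative markup, not the real duanziwang page):

from lxml import etree

html = '''
<div><p>
  <img src="http://example.com/a.jpg" title="a"/>
  <img src="http://example.com/b.jpg" title="b"/>
</p></div>
'''
selector = etree.HTML(html)
print selector.xpath('//div/p/img/@src')    # ['http://example.com/a.jpg', 'http://example.com/b.jpg']
print selector.xpath('//div/p/img/@title')  # ['a', 'b']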


3. Scraping duanziwang, a jokes site

# -*- coding: utf-8 -*-
import urllib2
import re

class Spider:
    """
        duanziwang.com spider
    """
    def loadPage(self, page):
        """
            @brief request one page of the site
            @param page which page number to request
            @returns the list of post bodies extracted from the page's html
        """

        url = "http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/" + str(page) + "/"
        # User-Agent header
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

        headers = {'User-Agent': user_agent}
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        html = response.read()
        # strip the <p> and </p> tags
        pattern = re.compile(r'(<p>|</p>)')
        html = pattern.sub("", html)
        # turn <br> into newlines
        pattern = re.compile(r'<br>')
        html = pattern.sub("\n", html)

        # capture the body of every post
        pattern = re.compile(r'<div class="post-content">(.*?)</div>', re.S)
        item_list = pattern.findall(html)

        return item_list

    def printOnePage(self, item_list, page):
        """
            @brief handle the list of posts that was fetched
            @param item_list the list of posts
            @param page which page is being handled
        """
        print item_list
        print "******* page %d crawled *******" % page
        for item in item_list:
            self.writeToFile(item)

    def writeToFile(self, text):
        '''
            @brief append the data to a file
            @param text the content to write
        '''
        with open("d:/124/duanzi.txt", 'a') as myFile:
            myFile.write(text)
            myFile.write("\r\n-----------------------------------------------------")

    def doWork(self):
        '''
        start the spider
        '''
        while self.enable:
            try:
                item_list = self.loadPage(self.page)
            except urllib2.URLError, e:
                # note: on error this retries the same page
                print e.reason
                continue

            # handle the list of posts that was fetched
            self.printOnePage(item_list, self.page)
            self.page += 1  # this page is done; move on to the next
            print "Press Enter to continue..."
            print "Type quit to exit"
            command = raw_input()
            if command == "quit":
                break


if __name__ == '__main__':
    """
        ======================
          duanziwang mini spider
        ======================
    """
    # create a Spider object
    mySpider = Spider()
    mySpider.page = 1
    mySpider.enable = True
    mySpider.doWork()
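
The three regex steps in loadPage are easiest to see on a toy input: first the <p> tags are stripped, then <br> becomes a newline, and only then is the body of each post-content div captured. A minimal sketch with made-up markup (not the live site's html):

import re

html = '<div class="post-content"><p>line one</p><br><p>line two</p></div>'
html = re.sub(r'(<p>|</p>)', "", html)   # strip the <p> and </p> tags
html = re.sub(r'<br>', "\n", html)       # turn <br> into real newlines
items = re.findall(r'<div class="post-content">(.*?)</div>', html, re.S)
print items   # ['line one\nline two']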