Batch Crawling Baidu Tieba Pages
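
This post collects three versions of the same Baidu Tieba batch downloader: one that spawns a thread per page, one that uses a process pool, and one that uses gevent coroutines. All three build each page URL from a keyword plus a pn offset that advances by 50 per page, download the raw HTML, and write it to a local file.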

from urllib import parse
import urllib.request
import threading
# Version 1: crawl with one thread per page

def loadPage(url, filename):
    '''
        Send a request to the given url and fetch the server's response.
        url: the url to crawl
        filename: name of the file to save to
    '''
    print('Downloading', filename)

    ua_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    # Build the request object with a browser User-Agent
    request = urllib.request.Request(url, headers=ua_headers)
    respond = urllib.request.urlopen(request)  # returns a file-like object
    print(respond.getcode())
    html = respond.read()  # bytes

    writePage(html, filename)

def writePage(html, filename):
    '''
        Write a fetched html page to local disk.
        html: the crawled page content (bytes)
        filename: the file to save to
    '''
    print('Saving', filename)
    # respond.read() returns bytes, so the file must be opened in binary mode
    with open(filename, 'wb') as f:
        f.write(html)


def webSpider(url, begin_page, end_page):
    '''
        Build the full url for each page and dispatch the requests.
        url: the base url to work from
        begin_page: first page to fetch
        end_page: last page to fetch
    '''
    threads = []
    for page in range(begin_page, end_page + 1):
        pn = (page - 1) * 50  # Tieba paginates in steps of 50

        filename = 'page_' + str(page) + '.html'

        full_url = url + '&pn=' + str(pn)  # assemble the complete url
        t = threading.Thread(target=loadPage, args=(full_url, filename))
        t.start()
        threads.append(t)
    for t in threads:  # wait for every download thread to finish
        t.join()

if __name__ == '__main__':
    while True:
        kw = input('Enter the Tieba keyword to crawl: ').strip()
        beginPage = input('First page: ').strip()
        endPage = input('Last page: ').strip()

        if kw and beginPage and endPage:
            word = parse.urlencode({'kw': kw})  # url-encode the keyword
            url = 'http://tieba.baidu.com/f?'
            new_url = url + word  # combined url, e.g. http://tieba.baidu.com/f?kw=lol

            webSpider(new_url, int(beginPage), int(endPage))
            break
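
The second version keeps loadPage essentially the same but hands the page URLs to a multiprocessing.Pool of four worker processes instead of spawning a thread per page; pool.close() followed by pool.join() blocks until every download has finished.
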
from urllib import parse
import urllib.request
from multiprocessing import Pool
# Version 2: crawl with a process pool

def loadPage(url, filename):
    '''
        Send a request to the given url and fetch the server's response.
        url: the url to crawl
        filename: name of the file to save to
    '''
    print('Downloading', filename)

    ua_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    # Build the request object with a browser User-Agent
    request = urllib.request.Request(url, headers=ua_headers)
    respond = urllib.request.urlopen(request)  # returns a file-like object
    print(respond.getcode())
    html = respond.read()  # bytes

    # Save the file to local disk (binary mode, since html is bytes)
    print('Saving', filename)
    with open(filename, 'wb') as f:
        f.write(html)

def webSpider(url, begin_page, end_page):
    '''
        Build the full url for each page and dispatch the requests.
        url: the base url to work from
        begin_page: first page to fetch
        end_page: last page to fetch
    '''
    pool = Pool(4)  # four worker processes
    for page in range(begin_page, end_page + 1):
        pn = (page - 1) * 50  # Tieba paginates in steps of 50

        filename = 'page_' + str(page) + '.html'

        full_url = url + '&pn=' + str(pn)  # assemble the complete url
        pool.apply_async(func=loadPage, args=(full_url, filename))
    pool.close()  # no more tasks will be submitted
    pool.join()   # block until all workers are done
    print('All downloads finished')

if __name__ == '__main__':
    while True:
        kw = input('Enter the Tieba keyword to crawl: ').strip()
        beginPage = input('First page: ').strip()
        endPage = input('Last page: ').strip()

        if kw and beginPage and endPage:
            word = parse.urlencode({'kw': kw})  # url-encode the keyword
            url = 'http://tieba.baidu.com/f?'
            new_url = url + word  # combined url, e.g. http://tieba.baidu.com/f?kw=lol

            webSpider(new_url, int(beginPage), int(endPage))
            break
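
The third version swaps processes for gevent coroutines: monkey.patch_all() replaces the blocking socket calls in the standard library, so each urlopen yields to the other greenlets while it waits on the network. Note that the patch is applied before urllib is imported, since gevent recommends patching as early as possible.
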
from gevent import monkey
monkey.patch_all()  # patch blocking stdlib calls before anything else is imported
from urllib import parse
import urllib.request
import gevent
# Version 3: crawl with gevent coroutines

def loadPage(url, filename):
    '''
        Send a request to the given url and fetch the server's response.
        url: the url to crawl
        filename: name of the file to save to
    '''
    print('Downloading', filename)

    ua_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    # Build the request object with a browser User-Agent
    request = urllib.request.Request(url, headers=ua_headers)
    respond = urllib.request.urlopen(request)  # returns a file-like object
    print(respond.getcode())
    html = respond.read()  # bytes

    # Save the file to local disk (binary mode, since html is bytes)
    print('Saving', filename)
    with open(filename, 'wb') as f:
        f.write(html)

def webSpider(url, begin_page, end_page):
    '''
        Build the full url for each page and dispatch the requests.
        url: the base url to work from
        begin_page: first page to fetch
        end_page: last page to fetch
    '''
    jobs = []
    for page in range(begin_page, end_page + 1):
        pn = (page - 1) * 50  # Tieba paginates in steps of 50

        filename = 'page_' + str(page) + '.html'

        full_url = url + '&pn=' + str(pn)  # assemble the complete url
        g = gevent.spawn(loadPage, full_url, filename)  # one greenlet per page
        jobs.append(g)
    gevent.joinall(jobs)  # wait for every greenlet to finish

if __name__ == '__main__':
    while True:
        kw = input('Enter the Tieba keyword to crawl: ').strip()
        beginPage = input('First page: ').strip()
        endPage = input('Last page: ').strip()

        if kw and beginPage and endPage:
            word = parse.urlencode({'kw': kw})  # url-encode the keyword
            url = 'http://tieba.baidu.com/f?'
            new_url = url + word  # combined url, e.g. http://tieba.baidu.com/f?kw=lol

            webSpider(new_url, int(beginPage), int(endPage))
            break
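
For reference, all three versions assume the same URL scheme. A minimal sketch, using the kw=lol example from the comments and pages 1 through 3, of the URLs a run would request:

from urllib import parse

# Reconstruct the URLs the spiders would fetch for kw=lol, pages 1-3
base = 'http://tieba.baidu.com/f?' + parse.urlencode({'kw': 'lol'})
for page in range(1, 4):
    print(base + '&pn=' + str((page - 1) * 50))
# http://tieba.baidu.com/f?kw=lol&pn=0
# http://tieba.baidu.com/f?kw=lol&pn=50
# http://tieba.baidu.com/f?kw=lol&pn=100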