Web Spider: Crawling Baidu Tieba
阿新 · Published 2018-12-26
1. Flexible crawling with a user-entered start page
# -*- coding: UTF-8 -*-

"""
# Warm-up example 1: send a request carrying an IE 9.0 User-Agent.
import urllib2

url = "http://www.baidu.com"

# The IE 9.0 User-Agent, carried in ua_header.
ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

# Build a Request from the url together with the headers;
# this request will carry the IE 9.0 browser's User-Agent.
request = urllib2.Request(url, headers=ua_header)

# Send the request to the server.
response = urllib2.urlopen(request)

html = response.read()
print html

# Warm-up example 2: URL-encode the query string.
import urllib    # handles URL encoding
import urllib2

url = "http://www.baidu.com/s"
word = {"wd": "華育興業"}
word = urllib.urlencode(word)   # convert to URL-encoded form (a string)
newurl = url + "?" + word       # "?" is the first separator in a URL

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}

request = urllib2.Request(newurl, headers=headers)
response = urllib2.urlopen(request)
print response.read()
"""

# Tieba spider example
import sys
import urllib
import urllib2

if sys.getdefaultencoding() != 'utf-8':
    reload(sys)
    sys.setdefaultencoding('utf-8')


def writeFile(html, filename):
    """
    Purpose: save the server's response to a file on the local disk.
    html: the server's response body
    filename: the local file name
    """
    with open("d:/123/" + filename.encode('gb2312'), 'w') as f:
        f.write(html)
    print "-" * 20


def tiebaSpider(url, beginPage, endPage):
    """
    Purpose: build the URL for each page and dispatch the requests.
    url: the base URL to process
    beginPage: the first page the spider should crawl
    endPage: the last page the spider should crawl
    """
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        filename = "第" + str(page) + "頁.html"
        # Build the full URL; pn grows by 50 for each page.
        fullurl = url + "&pn=" + str(pn)
        # Call loadPage() to send the request and fetch the HTML page.
        html = loadPage(fullurl, filename)
        # Write the fetched HTML page to the local disk.
        writeFile(html, filename)


def loadPage(url, filename):
    """
    Purpose: send a request for the url and return the server's response.
    url: the URL to crawl
    filename: the file name (used only for the progress message)
    """
    print "正在下載" + filename
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    return response.read()


# Simulate a main function
if __name__ == "__main__":
    kw = raw_input("請輸入需要爬取的貼吧:")
    # Read the start and end pages, converting str to int.
    beginPage = int(raw_input("請輸入起始頁:"))
    endPage = int(raw_input("請輸入終止頁:"))

    url = "http://tieba.baidu.com/f?"
    key = urllib.urlencode({"kw": kw})
    # Example of the combined URL: http://tieba.baidu.com/f?kw=lol
    url = url + key
    tiebaSpider(url, beginPage, endPage)
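Note that urllib2 and raw_input only exist in Python 2. For reference, here is a minimal Python 3 sketch of the same spider, assuming the same d:/123/ output directory and UTF-8 pages; the page_%d.html naming is mine, chosen to avoid the gb2312 file-name encoding dance:

import urllib.parse
import urllib.request

HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

def load_page(url):
    # Build the request with our User-Agent and return the decoded body.
    request = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8", errors="replace")

def tieba_spider(kw, begin_page, end_page):
    base = "http://tieba.baidu.com/f?" + urllib.parse.urlencode({"kw": kw})
    for page in range(begin_page, end_page + 1):
        pn = (page - 1) * 50          # Tieba paginates in steps of 50
        html = load_page(base + "&pn=" + str(pn))
        filename = "d:/123/page_%d.html" % page   # hypothetical naming scheme
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html)
        print("saved", filename)

if __name__ == "__main__":
    tieba_spider(input("貼吧名稱: "),
                 int(input("起始頁: ")),
                 int(input("終止頁: ")))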
2. Crawling images from duanziwang.com with lxml and XPath
# -*- coding: UTF-8 -*-
import sys
import urllib
import urllib2
from lxml import etree

if sys.getdefaultencoding() != 'utf-8':
    reload(sys)
    sys.setdefaultencoding('utf-8')


class Spider:
    def __init__(self):
        self.beginPage = int(raw_input("請輸入起始頁:"))
        self.endPage = int(raw_input("請輸入終止頁:"))
        self.url = 'http://duanziwang.com/category/%E6%90%9E%E7%AC%91%E5%9B%BE/'
        self.ua_header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

    def tiebaSpider(self):
        # Visit each listing page in the requested range.
        for page in range(self.beginPage, self.endPage + 1):
            myUrl = self.url + str(page) + '/'
            self.loadImages(myUrl)

    def loadImages(self, link):
        """Fetch one listing page and extract every image URL and title."""
        req = urllib2.Request(link, headers=self.ua_header)
        html = urllib2.urlopen(req).read()
        selector = etree.HTML(html)
        imagesLinks = selector.xpath('//div/p/img/@src')
        imagesNames = selector.xpath('//div/p/img/@title')
        # Take each image path in turn, download it and save it.
        for (imageslink, imagesname) in zip(imagesLinks, imagesNames):
            self.writeImages(imageslink, imagesname)

    def writeImages(self, imagesLink, imagesName):
        """Write the image's binary content into a file named after its title."""
        print "正在儲存檔案 %s ..." % imagesName
        # 1. Open the file; the with-block closes it automatically.
        with open("d:/124/" + imagesName, "wb") as f:
            # 2. Download the image content.
            image = urllib2.urlopen(imagesLink).read()
            # 3. Call the file object's write() method to store the content.
            f.write(image)


# Simulate a main function
if __name__ == "__main__":
    # First create the spider object...
    mySpider = Spider()
    # ...then call its method to set it to work.
    mySpider.tiebaSpider()
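The two XPath expressions do all the extraction work here. As a quick sanity check, here is a self-contained demo of the same queries run against an inline HTML fragment (the example.com URLs and titles are made up):

from lxml import etree

html = """
<div><p>
  <img src="http://example.com/a.jpg" title="first"/>
  <img src="http://example.com/b.jpg" title="second"/>
</p></div>
"""

selector = etree.HTML(html)
# '//div/p/img/@src' matches the src attribute of every <img> sitting
# inside a <p> inside a <div>, anywhere in the document.
print(selector.xpath('//div/p/img/@src'))    # ['http://example.com/a.jpg', ...]
print(selector.xpath('//div/p/img/@title'))  # ['first', 'second']

One caveat about the zip() pairing: it matches the two lists by position, so if any <img> on the real page lacks a title attribute, the title list comes back shorter and every pair after that point is misaligned. Querying '//div/p/img' once and reading .get('src') and .get('title') per element would be more robust.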
3. Crawling the duanzi (jokes) site
# -*- coding: utf-8 -*-
import urllib2
import re


class Spider:
    """Spider for duanziwang.com, a jokes site."""

    def loadPage(self, page):
        """
        @brief request one page of the site
        @param page which page number to request
        @returns the list of joke items extracted from the page's HTML
        """
        url = "http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/" + str(page) + "/"
        # User-Agent header
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        headers = {'User-Agent': user_agent}

        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        html = response.read()

        # Strip the <p> tags, turn <br> into real newlines, then pull out
        # the body of each post.
        pattern = re.compile(r'(<p>|</p>)')
        html = pattern.sub("", html)
        pattern = re.compile(r'<br>')
        html = pattern.sub("\n", html)
        pattern = re.compile(r'<div class="post-content">(.*?)</div>', re.S)
        item_list = pattern.findall(html)
        return item_list

    def printOnePage(self, item_list, page):
        """
        @brief handle the list of jokes fetched for one page
        @param item_list the list of jokes
        @param page which page is being handled
        """
        print item_list
        print "******* 第 %d 頁 爬取完畢...*******" % page
        for item in item_list:
            self.writeToFile(item)

    def writeToFile(self, text):
        """
        @brief append one item to the output file
        @param text the content to write
        """
        with open("d:/124/duanzi.txt", 'a') as myFile:
            myFile.write(text)
            myFile.write("\r\n-----------------------------------------------------")

    def doWork(self):
        """Set the spider to work."""
        while self.enable:
            try:
                item_list = self.loadPage(self.page)
            except urllib2.URLError, e:
                print e.reason
                # Note: a persistent URLError makes this retry the same page forever.
                continue

            # Handle the jokes fetched for this page.
            self.printOnePage(item_list, self.page)
            self.page += 1  # this page is done, move on to the next
            print "按回車繼續..."
            print "輸入 quit 退出"
            command = raw_input()
            if command == "quit":
                break


if __name__ == '__main__':
    """
    ======================
    duanziwang mini-spider
    ======================
    """
    # Create a Spider object and configure its starting state.
    mySpider = Spider()
    mySpider.page = 1
    mySpider.enable = True
    mySpider.doWork()
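The three chained regexes are the heart of loadPage(). This self-contained demo applies the same clean-up to a made-up fragment that mimics the site's post-content markup, showing what item_list ends up holding:

import re

html = ('<div class="post-content"><p>line one<br>line two</p></div>'
        '<div class="post-content"><p>another joke</p></div>')

html = re.sub(r'(<p>|</p>)', '', html)   # drop the <p> wrappers
html = re.sub(r'<br>', '\n', html)       # turn <br> into real newlines
items = re.findall(r'<div class="post-content">(.*?)</div>', html, re.S)
print(items)  # ['line one\nline two', 'another joke']

The non-greedy (.*?) together with re.S makes each match span newlines but stop at the first closing </div>, so a nested <div> inside a post body would truncate its match. That fragility is exactly why the lxml/XPath approach from section 2 is usually the safer choice for HTML.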