Breadth-First Traversal: A Python 3 Crawler Implementation
阿新 · Published 2018-12-18
Reference: https://www.cnblogs.com/goodhacker/p/3353146.html
This post implements a web crawler in Python 3 that traverses pages in breadth-first-search (BFS) order.
Environment: Jupyter Notebook
Start page: https://www.cnblogs.com/goodhacker/p/3353146.html
Target URL: http://book.51cto.com/art/201012/236668.htm
Source code:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import socket
import urllib.request as request
import re


class MyCrawler:
    def __init__(self, seeds):
        # Initialize the URL queue with the seed URL(s).
        self.linkQuence = linkQuence()
        if isinstance(seeds, str):
            self.linkQuence.addUnvisitedUrl(seeds)
        if isinstance(seeds, list):
            for i in seeds:
                self.linkQuence.addUnvisitedUrl(i)
        print("Add the seeds url \"%s\" to the unvisited url list" % str(self.linkQuence.unVisited))

    # Main crawl loop.
    def crawling(self, seeds, crawl_count):
        # Loop while there are still unvisited links and no more than
        # crawl_count pages have been visited.
        while self.linkQuence.unVisitedUrlsEnmpy() is False and self.linkQuence.getVisitedUrlCount() <= crawl_count:
            # Dequeue the URL at the head of the queue.
            visitUrl = self.linkQuence.unVisitedUrlDeQuence()
            print("Pop out one url \"%s\" from unvisited url list" % visitUrl)
            # Stop once the target page is reached.
            if visitUrl == "http://book.51cto.com/art/201012/236668.htm":
                break
            if visitUrl is None or visitUrl == "":
                continue
            # Extract the hyperlinks from the page.
            links = self.getHyperLinks(visitUrl)
            print("Get %d new links" % len(links))
            # Mark this URL as visited.
            self.linkQuence.addVisitedUrl(visitUrl)
            print("Visited url count: " + str(self.linkQuence.getVisitedUrlCount()))
            # Enqueue the URLs that have not been visited yet.
            for link in links:
                self.linkQuence.addUnvisitedUrl(link)
            print("%d unvisited links:" % len(self.linkQuence.getUnvisitedUrl()))

    # Extract the hyperlinks from a page's source.
    def getHyperLinks(self, url):
        links = []
        data = self.getPageSource(url)
        if data[0] == "200":
            soup = BeautifulSoup(data[1], "html.parser")
            a = soup.findAll("a", {"href": re.compile(".*")})
            for i in a:
                if i["href"].find("http://") != -1:
                    links.append(i["href"])
        return links

    # Fetch a page's source.
    def getPageSource(self, url, timeout=20, coding=None):
        try:
            socket.setdefaulttimeout(timeout)
            req = request.Request(url)
            req.add_header('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')
            response = request.urlopen(req)
            if coding is None:
                page = response.read()
            else:
                page = response.read().decode(coding).encode('utf-8')
            return ["200", page]
        except Exception as e:
            print(str(e))
            return [str(e), None]


class linkQuence:
    def __init__(self):
        # URLs that have already been visited.
        self.visted = []
        # URLs waiting to be visited.
        self.unVisited = []

    # Return the visited-URL queue.
    def getVisitedUrl(self):
        return self.visted

    # Return the unvisited-URL queue.
    def getUnvisitedUrl(self):
        return self.unVisited

    # Append a URL to the visited queue.
    def addVisitedUrl(self, url):
        self.visted.append(url)

    # Remove a URL from the visited queue.
    def removeVisitedUrl(self, url):
        self.visted.remove(url)

    # Dequeue an unvisited URL (FIFO: new URLs are inserted at index 0,
    # so pop() from the end returns the oldest one).
    def unVisitedUrlDeQuence(self):
        try:
            return self.unVisited.pop()
        except IndexError:
            return None

    # Guarantee each URL is enqueued (and hence visited) only once.
    def addUnvisitedUrl(self, url):
        if url != "" and url not in self.visted and url not in self.unVisited:
            self.unVisited.insert(0, url)

    # Number of visited URLs.
    def getVisitedUrlCount(self):
        return len(self.visted)

    # Number of unvisited URLs.
    def getUnvistedUrlCount(self):
        return len(self.unVisited)

    # Is the unvisited queue empty?
    def unVisitedUrlsEnmpy(self):
        return len(self.unVisited) == 0


def main(seeds, crawl_count):
    craw = MyCrawler(seeds)
    craw.crawling(seeds, crawl_count)


if __name__ == "__main__":
    main(["https://www.cnblogs.com/goodhacker/p/3353146.html"], 50)
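A note on the queue design: linkQuence gets FIFO (breadth-first) order from a plain list by inserting new URLs at index 0 and popping from the end, but list.insert(0, ...) is O(n) per call. For larger crawls, a collections.deque is the more idiomatic choice. Below is a minimal sketch of the same idea; the class name LinkDeque and its method names are my own, not from the original post:

from collections import deque

class LinkDeque:
    """FIFO URL queue backed by collections.deque; O(1) at both ends."""
    def __init__(self):
        self.visited = []
        self.unvisited = deque()
        self._seen = set()  # fast membership test for de-duplication

    def add_unvisited(self, url):
        # Enqueue only URLs we have never seen before.
        if url and url not in self._seen:
            self._seen.add(url)
            self.unvisited.append(url)

    def dequeue(self):
        # Oldest URL first, i.e. breadth-first order.
        return self.unvisited.popleft() if self.unvisited else None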
Run output: a console log of each popped URL and the visited/unvisited counts (shown as a screenshot in the original post).
The main issue encountered: the reference article is written for Python 2, and several of the functions it uses changed in Python 3 (most notably the urllib APIs); this was resolved by consulting the official Python documentation.
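For reference, the key change: Python 2's urllib2 module was split into urllib.request and urllib.error in Python 3, and response.read() now returns bytes rather than str. A minimal before/after sketch:

# Python 2 (as in the reference article):
# import urllib2
# req = urllib2.Request(url)
# page = urllib2.urlopen(req).read()  # returns a str

# Python 3 equivalent, as used in the code above:
import urllib.request as request
req = request.Request("https://www.cnblogs.com/goodhacker/p/3353146.html")
page = request.urlopen(req).read()  # returns bytes; call .decode() to get str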