
Breadth-first traversal: a Python 3 crawler implementation

Reference: https://www.cnblogs.com/goodhacker/p/3353146.html

This post implements a web crawler in Python 3 using breadth-first search.

Environment: Jupyter Notebook

Seed (start) URL: https://www.cnblogs.com/goodhacker/p/3353146.html

Target URL: http://book.51cto.com/art/201012/236668.htm
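
The core idea is a breadth-first traversal of the link graph: pages are crawled level by level, starting from the seed, until the target page is reached or a page limit is hit. As a minimal sketch of that idea (not part of the original code; bfs_crawl and fetch_links are hypothetical names, and fetch_links stands in for the link-extraction step implemented below by MyCrawler.getHyperLinks):

from collections import deque

def bfs_crawl(seed, target, fetch_links, max_pages=50):
    visited = set()
    frontier = deque([seed])           # FIFO queue: pages closer to the seed are crawled first
    while frontier and len(visited) < max_pages:
        url = frontier.popleft()       # take the oldest url -> breadth-first order
        if url == target:              # stop once the target page is reached
            return url
        if url in visited:
            continue
        visited.add(url)
        for link in fetch_links(url):  # fetch_links(url) should return the outgoing links of a page
            if link not in visited:
                frontier.append(link)  # newly discovered links go to the back of the queue
    return None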

Source code:

#encoding=utf-8
from bs4 import BeautifulSoup
import socket
import urllib.request as request
import re

class MyCrawler:
    def __init__(self, seeds):
        # Initialise the url queue with the seed urls
        self.linkQuence = linkQuence()
        if isinstance(seeds, str):
            self.linkQuence.addUnvisitedUrl(seeds)
        if isinstance(seeds, list):
            for i in seeds:
                self.linkQuence.addUnvisitedUrl(i)
        print("Add the seeds url \"%s\" to the unvisited url list" % str(self.linkQuence.unVisited))
    
    # Main crawling loop
    def crawling(self, seeds, crawl_count):
        # Keep going while there are unvisited links and no more than crawl_count pages have been visited
        while self.linkQuence.unVisitedUrlsEnmpy() is False and self.linkQuence.getVisitedUrlCount() <= crawl_count:
            # Dequeue the url at the head of the queue
            visitUrl = self.linkQuence.unVisitedUrlDeQuence()
            print("Pop out one url \"%s\" from unvisited url list" % visitUrl)
            if visitUrl == "http://book.51cto.com/art/201012/236668.htm":
                break
            if visitUrl is None or visitUrl == "":
                continue

            # Extract the hyperlinks on this page
            links = self.getHyperLinks(visitUrl)
            print("Get %d new links" % len(links))
            # Mark the url as visited
            self.linkQuence.addVisitedUrl(visitUrl)
            print("Visited url count: " + str(self.linkQuence.getVisitedUrlCount()))
            # Enqueue the links that have not been visited yet
            for link in links:
                self.linkQuence.addUnvisitedUrl(link)
            print("%d unvisited links:" % len(self.linkQuence.getUnvisitedUrl()))
                  
        
    # Extract the hyperlinks from the page source
    def getHyperLinks(self, url):
        links = []
        data = self.getPageSource(url)
        if data[0] == "200":
            soup = BeautifulSoup(data[1], "html.parser")
            a = soup.findAll("a", {"href": re.compile(".*")})
            for i in a:
                if i["href"].find("http://") != -1:
                    links.append(i["href"])
        return links
          
    # Fetch the page source of a url
    def getPageSource(self, url, timeout=20, coding=None):
        try:
            socket.setdefaulttimeout(timeout)
            req = request.Request(url)
            req.add_header('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')
            response = request.urlopen(req)
            if coding is None:
                page = response.read()
            else:
                page = response.read()
                page = page.decode(coding).encode('utf-8')
            return ["200", page]
        except Exception as e:
            print(str(e))
            return [str(e), None]
              
class linkQuence:
    def __init__(self):
        # Urls that have already been visited
        self.visted = []
        # Urls waiting to be visited
        self.unVisited = []
    # Return the visited url queue
    def getVisitedUrl(self):
        return self.visted
    # Return the unvisited url queue
    def getUnvisitedUrl(self):
        return self.unVisited
    # Add a url to the visited queue
    def addVisitedUrl(self, url):
        self.visted.append(url)
    # Remove a url from the visited queue
    def removeVisitedUrl(self, url):
        self.visted.remove(url)
    # Dequeue an unvisited url (the oldest one, since new urls are inserted at the front)
    def unVisitedUrlDeQuence(self):
        try:
            return self.unVisited.pop()
        except:
            return None
    # Make sure each url is enqueued only once
    def addUnvisitedUrl(self, url):
        if url != "" and url not in self.visted and url not in self.unVisited:
            self.unVisited.insert(0, url)
    # Number of visited urls
    def getVisitedUrlCount(self):
        return len(self.visted)
    # Number of unvisited urls
    def getUnvistedUrlCount(self):
        return len(self.unVisited)
    # Is the unvisited queue empty?
    def unVisitedUrlsEnmpy(self):
        return len(self.unVisited) == 0
          
def main(seeds, crawl_count):
    craw = MyCrawler(seeds)
    craw.crawling(seeds, crawl_count)

if __name__ == "__main__":
    main(["https://www.cnblogs.com/goodhacker/p/3353146.html"], 50)

Execution result:

The main issue I ran into is that the referenced article is written for Python 2, and several of the functions it uses changed in Python 3; this was resolved by consulting the official Python documentation.
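
Most of the changes are in the urllib module: what urllib2 provides in Python 2 now lives in urllib.request, print became a function, and except clauses use "as". A rough illustration (the Python 2 forms in the comments are the usual urllib2 equivalents, which I assume the referenced article used):

import urllib.request as request        # Python 2: import urllib2

req = request.Request("https://www.cnblogs.com/goodhacker/p/3353146.html")   # Python 2: urllib2.Request(url)
req.add_header('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')
response = request.urlopen(req)          # Python 2: urllib2.urlopen(req)
page = response.read()                   # read() returns bytes in Python 3; decode() before using it as text
print(len(page))                         # Python 2: print len(page)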