Some Notes on a Douban Crawler
阿新 • Published 2018-12-03
Knowledge points used in the implementation:
1. Create a process pool (to speed up crawling) and a manager queue (for communication between processes) to carry out the crawl; see the first sketch after this list.
2. Use queues (a plain list stands in for them here) to deduplicate URLs.
There are two parts to the deduplication. One part, before crawling, checks whether the URL about to be crawled is already in the crawled-URL queue; the other, when the URL is not in the crawled queue and is about to be added to the to-crawl queue, checks whether it is already waiting in the to-crawl queue and discards it if so. See the second sketch after this list.
3. Use a queue to simulate a breadth-first traversal when crawling the URLs.
4. The process pool does the crawling of URLs, the manager queue carries messages between processes, and the deduplication queues do the deduplication.
The URLs to be crawled are submitted to the process pool together with the manager queue (which passes completed URLs back); the URLs handled this way all belong to the same group.
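For point 1, the key pattern is that worker processes in a Pool report results back to the parent through a Manager().Queue(), which (unlike a plain multiprocessing.Queue) can be passed to pool workers as an argument. Below is a minimal sketch of that round trip; the worker body and the URLs are placeholders, not the crawler's real logic.

from multiprocessing import Pool, Manager

def worker(url, q):
    # ... fetch and process url here (placeholder) ...
    q.put(url)  # report the finished url back to the parent

if __name__ == "__main__":
    q = Manager().Queue()  # shared between the parent and the pool workers
    pool = Pool()
    urls = ["https://example.com/a", "https://example.com/b"]  # placeholder urls
    for u in urls:
        pool.apply_async(func=worker, args=(u, q))
    for _ in urls:
        print("done:", q.get())  # blocks until a worker reports back
    pool.close()
    pool.join()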
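Points 2 and 3 combine into one pattern: a FIFO frontier plus the two membership checks. A small sketch of just that pattern, with extract_links standing in for whatever function pulls links out of a fetched page:

def bfs_crawl(seed, extract_links):
    crawl_queue = [seed]  # to-crawl queue (a list standing in for a queue)
    crawled_queue = []    # already-crawled queue
    while crawl_queue:
        url = crawl_queue.pop(0)  # FIFO pop gives breadth-first order
        crawled_queue.append(url)
        for link in extract_links(url):
            # check 1: skip links that were already crawled
            # check 2: skip links already waiting in the to-crawl queue
            if link not in crawled_queue and link not in crawl_queue:
                crawl_queue.append(link)
    return crawled_queue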
# -*- coding: utf-8 -*-
"""
Created on Tue May 29 10:33:56 2018

@author: Administrator
"""
from bs4 import BeautifulSoup
import re
import basicSpider  # the author's own download helper module
from multiprocessing import Pool, Manager


def get_html(url):
    """Fetch the HTML source of one page."""
    headers = [("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")]
    # proxy = {"http": "182.129.243.84:9000"}
    html = basicSpider.downloadHtml(url, headers=headers)
    return html


def get_movie_all(html):
    """Get the list entries for all movies on the current page."""
    soup = BeautifulSoup(html, "html.parser")
    movie_list = soup.find_all('div', class_='bd doulist-subject')
    # print(movie_list)
    return movie_list


def get_movie_one(movie):
    """Get the details of one movie and join them into a single string."""
    result = ""
    soup = BeautifulSoup(str(movie), "html.parser")
    title = soup.find_all('div', class_="title")
    soup_title = BeautifulSoup(str(title[0]), "html.parser")
    for line in soup_title.stripped_strings:
        result += line
    try:
        score = soup.find_all('span', class_='rating_nums')
        score_ = BeautifulSoup(str(score[0]), "html.parser")
        for line in score_.stripped_strings:
            result += "|| 評分:"  # "評分" = rating
            result += line
    except IndexError:  # no rating on the page: fall back to a default
        result += "|| 評分:5.0"
    abstract = soup.find_all('div', class_='abstract')
    abstract_info = BeautifulSoup(str(abstract[0]), "html.parser")
    for line in abstract_info.stripped_strings:
        result += "|| "
        result += line
    result += '\n'
    # print(result)
    return result


def save_file(movieInfo, lock):
    """Write to the file, appending to what is already there."""
    with open("doubanMovie.txt", "ab") as f:
        # lock.acquire()
        f.write(movieInfo.encode("utf-8"))
        # lock.release()


def CrawlMovieInfo(url, q, lock):
    """Crawl one page of movie data and write it to the file."""
    html = get_html(url)
    movie_list = get_movie_all(html)
    for it in movie_list:
        save_file(get_movie_one(it), lock)
    q.put(url)  # report the completed url back to the parent


if __name__ == "__main__":
    # Create the process pool and the manager queue that carry out the crawl
    pool = Pool()
    q = Manager().Queue()
    lock = Manager().Lock()

    url = "https://www.douban.com/doulist/3516235/?start=225&sort=seq&sub_type="
    CrawlMovieInfo(url, q, lock)  # crawl the seed page (q and lock must be passed here too)
    html = get_html(url)
    # Mind the quotes in the regular expression
    pattern = re.compile(r'(https://www.douban.com/doulist/3516235/\?start=.*)"')
    itemUrls = re.findall(pattern, html)
    # for i in itemUrls:
    #     print(i)

    # Two-step deduplication
    crawl_queue = []    # to-crawl queue
    crawled_queue = []  # already-crawled queue
    for item in itemUrls:
        if item not in crawled_queue:  # step one: the url must not be in the crawled queue
            crawl_queue.append(item)
    # step two: deduplicate the to-crawl queue itself
    crawl_queue = list(set(crawl_queue))

    # Simulate a breadth-first traversal
    while crawl_queue:  # take values from the to-crawl queue until it is empty
        url = crawl_queue.pop(0)  # pop the first value in the to-crawl queue
        # CrawlMovieInfo(url, q, lock)
        pool.apply_async(func=CrawlMovieInfo, args=(url, q, lock))
        # move a url that has finished processing into the crawled queue
        urlCompleted = q.get()
        crawled_queue.append(urlCompleted)

    pool.close()
    pool.join()
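One caveat about the regular expression above, beyond the quoting that the comment flags: .* is greedy, so when a single line of HTML holds more than one matching link, the capture runs to the last " on that line and swallows the markup in between. A non-greedy .*? stops at the first closing quote. A quick check against a made-up two-link snippet:

import re

html = ('<a href="https://www.douban.com/doulist/3516235/?start=25&sort=seq&sub_type=">25</a> '
        '<a href="https://www.douban.com/doulist/3516235/?start=50&sort=seq&sub_type=">50</a>')

greedy = re.findall(r'(https://www.douban.com/doulist/3516235/\?start=.*)"', html)
lazy = re.findall(r'(https://www.douban.com/doulist/3516235/\?start=.*?)"', html)
print(len(greedy))  # 1 -- one over-long match spanning both links
print(len(lazy))    # 2 -- two clean urls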