使用線程池進行爬蟲
阿新 • • 發佈:2018-01-17
線程 detail pip3 dsw ont form exce done pat
# Thread-pool crawler: fetch the list pages, follow every item's detail page
# and download the MP4 video referenced there.
import hashlib
import logging
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import requests  # pip3 install requests

logger = logging.getLogger(__name__)

pool = ThreadPoolExecutor(50)
movie_path = r'C:\mp4'

SITE_ROOT = 'http://www.xiaohuar.com'
REQUEST_TIMEOUT = 10  # seconds; without it a stalled server blocks a worker forever

# Number of submitted tasks whose done-callbacks have not finished yet.
# main() waits for it to drop to zero before returning.
_idle = threading.Condition()
_pending = 0


def _task_finished(_future):
    """Mark one scheduled task (including its callback) as finished."""
    global _pending
    with _idle:
        _pending -= 1
        if _pending == 0:
            _idle.notify_all()


def _submit(func, url, callback=None):
    """Run ``func(url)`` on the pool and track it until its callback is done.

    Done-callbacks run in the order they were added, so ``callback`` (which
    may schedule follow-up tasks) always completes before ``_task_finished``
    decrements the counter. The counter therefore cannot reach zero while
    follow-up work is still being queued.
    """
    global _pending
    future = pool.submit(func, url)
    with _idle:
        _pending += 1
    if callback is not None:
        future.add_done_callback(callback)
    future.add_done_callback(_task_finished)
    return future


def _wait_until_idle():
    """Block until every task scheduled through ``_submit`` has finished."""
    with _idle:
        while _pending:
            _idle.wait()


def get_page(url):
    """Download ``url`` and return its text.

    Returns ``None`` when the request fails or the status code is not 200;
    the failure is logged instead of being silently discarded.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        logger.warning('failed to fetch %s: %s', url, exc)
        return None
    if response.status_code != 200:
        logger.warning('unexpected status %s for %s', response.status_code, url)
        return None
    return response.text


def parse_index(index_page):
    """Done-callback for a list page: schedule a fetch of every detail page.

    ``index_page`` is the future returned for ``get_page``; its result is the
    page HTML, or ``None`` if the download failed.
    """
    html = index_page.result()
    if not html:
        # get_page already reported the failure; nothing to parse.
        return
    urls = re.findall('class="items".*?href="(.*?)"', html, re.S)
    for detail_url in urls:
        if not detail_url.startswith('http'):
            # Relative link: make it absolute against the site root.
            detail_url = SITE_ROOT + detail_url
        _submit(get_page, detail_url, parse_detail)


def parse_detail(detail_page):
    """Done-callback for a detail page: schedule the video download, if any.

    ``detail_page`` is the future returned for ``get_page``; its result is the
    page HTML, or ``None`` if the download failed.
    """
    html = detail_page.result()
    if not html:
        return
    sources = re.findall('id="media".*?src="(.*?)"', html, re.S)
    if sources:
        movie_url = sources[0]
        if movie_url.endswith('mp4'):
            _submit(get_movie, movie_url)


def get_movie(url):
    """Download the video at ``url`` into ``movie_path``.

    The file name is the MD5 hex digest of the current time plus the URL.
    Network and file-system errors are logged and the download is skipped.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        logger.warning('failed to download %s: %s', url, exc)
        return
    if response.status_code != 200:
        logger.warning('unexpected status %s for %s', response.status_code, url)
        return

    m = hashlib.md5()
    m.update(str(time.time()).encode('utf-8'))
    m.update(url.encode('utf-8'))
    filepath = os.path.join(movie_path, m.hexdigest() + '.mp4')

    try:
        os.makedirs(movie_path, exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(response.content)
    except OSError as exc:
        logger.warning('could not save %s to %s: %s', url, filepath, exc)
        return
    print('%s 下載成功' % url)


def main():
    """Crawl list pages 0-4 and block until every queued download is done.

    Waiting here matters: on Python 3.9+ an executor rejects ``submit`` calls
    with ``RuntimeError`` once interpreter shutdown has begun, so returning
    early would make the follow-up tasks queued from callbacks fail.
    """
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for i in range(5):
        _submit(get_page, base_url.format(page_num=i), parse_index)
    _wait_until_idle()


if __name__ == '__main__':
    main()
使用線程池進行爬蟲