requests+re+multiprocessing爬取貓眼電影top100
阿新 • • 發佈:2019-04-05
core except exce mat RoCE yield tle itl pan
"""Scrape the Maoyan Top-100 movie board with requests, re and multiprocessing."""
import json
import re
from multiprocessing import Pool

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block its worker process forever.
REQUEST_TIMEOUT = 10

# One match per <dd> entry. Groups, in order: rank index, poster URL, title,
# cast text, release-time text, integer part and fractional part of the score.
# Compiled once at import time; re.S lets "." span the newlines in the markup.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?poster-default.*?src="(.*?)"'
    r'.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)'
    r'</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>',
    re.S,
)


def get_one_page(url):
    """Fetch a single page.

    :param url: address of the page to download
    :return: the response body as text, or None when the request fails or
        the server answers with a status other than 200
    """
    # Imported here so the parsing/saving helpers stay importable even when
    # the third-party ``requests`` package is not installed.
    import requests
    from requests.exceptions import RequestException

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Extract the movie entries from one board page.

    :param html: page markup; None or an empty string yields nothing
    :return: generator of dicts with the keys "index", "image", "title",
        "star", "time" and "score" (all values are strings)
    """
    if not html:
        # get_one_page() signals failure with None; there is nothing to parse.
        return
    for item in _MOVIE_PATTERN.findall(html):
        yield {
            "index": item[0],
            "image": item[1],
            "title": item[2],
            # The slices drop a fixed-length leading label (3 and 5 characters);
            # presumably the "cast:" / "release time:" captions - verify against
            # the live page markup.
            "star": item[3].strip()[3:],
            "time": item[4].strip()[5:],
            "score": item[5] + item[6],
        }


def save_to_file(content):
    """Append one record to ``maoyan.txt`` as a single JSON line.

    :param content: JSON-serialisable object to store
    :return: None
    """
    with open("maoyan.txt", "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main(offset):
    """Download, parse and store the board page starting at ``offset``.

    :param offset: value of the ``offset`` query parameter of the board URL
    :return: None
    """
    url = "https://maoyan.com/board/4?offset={}".format(offset)
    html = get_one_page(url)
    if html is None:
        # Download failed: skip this page instead of crashing the worker.
        return
    for item in parse_one_page(html):
        save_to_file(item)


if __name__ == "__main__":
    # Request the ten pages from several worker processes to cut down the time
    # wasted waiting on the network. Pool.map blocks until every page is done;
    # the ``with`` block then shuts the pool down. Each worker appends to the
    # output file itself, so the order of records across pages is not fixed.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
requests+re+multiprocessing爬取貓眼電影top100