Requests+正則表達式 爬取貓眼電影
阿新 • • 發佈:2017-07-03
movies core http status roc find apple ascii int
代碼:
import json
import re
from multiprocessing import Pool

# URL template of the Maoyan board; %d is the zero-based rank offset of the
# first movie on the requested page.
basic_url = 'http://maoyan.com/board/4?offset=%d'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/59.0.3071.109 Safari/537.36'
    ),
}

OUTPUT_PATH = "maoyan_movies.txt"  # one JSON object per line, appended
REQUEST_TIMEOUT = 10               # seconds to wait before giving up on a page
PAGE_SIZE = 10                     # the offset query parameter moves in steps of 10
PAGE_COUNT = 10                    # 10 pages x 10 movies = top 100

# Compiled once at import time instead of on every parse_page() call.
# Capture groups, in order: rank, image URL, title, star text,
# release-time text, integer part of the score, fractional part of the score.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>'
    r'.*?<img data-src="(.*?)"'
    r'.*?class="name"><a.*?>(.*?)</a>'
    r'.*?class="star">(.*?)</p>'
    r'.*?class="releasetime">(.*?)</p>'
    r'.*?class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S,
)


def get_page(url):
    """Download *url* and return the response body as text.

    Returns None when the request fails, times out, or the server answers
    with a status other than 200.
    """
    # Imported here rather than at module level so the parsing and saving
    # helpers stay importable when the requests package is not installed.
    import requests
    from requests.exceptions import RequestException

    try:
        # Without a timeout a stalled connection would block a worker forever.
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == requests.codes.ok:
        return response.text
    return None


def parse_page(content):
    """Yield one dict per movie entry found in the HTML text *content*.

    Each dict has the keys 'id', 'image', 'name', 'actor', 'releasetime' and
    'score'.  Nothing is yielded when *content* is None or empty, which is
    what get_page() hands back for a failed download.
    """
    if not content:
        return
    for item in _MOVIE_PATTERN.findall(content):
        yield {
            'id': item[0],
            'image': item[1],
            'name': item[2].strip(),
            # Drops the first 3 characters of the stripped text -- presumably
            # a "starring:" label; confirm against the live markup.
            'actor': item[3].strip()[3:],
            # Drops the first 5 characters -- presumably a "release time:"
            # label; confirm against the live markup.
            'releasetime': item[4][5:],
            'score': item[5] + item[6],
        }


def save_to_file(content, path=OUTPUT_PATH):
    """Append *content* to *path* as a single line of JSON.

    The file is opened and closed on every call, so each record is flushed
    before the calling worker process can exit; no long-lived handle is
    shared between the pool's processes.
    """
    line = json.dumps(content, ensure_ascii=False) + '\n'
    with open(path, 'a', encoding="utf-8") as out:
        out.write(line)


def get_page_movies(offset):
    """Fetch one board page and save every movie found on it.

    *offset* is the zero-based page index; it is multiplied by PAGE_SIZE (10)
    to build the page's URL.  A page that cannot be downloaded is skipped.
    """
    url = basic_url % (PAGE_SIZE * offset)
    html = get_page(url)
    for movie_info in parse_page(html):
        save_to_file(movie_info)


# Collects the Maoyan top-100 movies: rank, image URL, title, starring actors,
# release date and score.
def get_top_100_movies():
    """Scrape all ten board pages using a pool of four worker processes."""
    pool = Pool(processes=4)
    try:
        pool.map(get_page_movies, range(PAGE_COUNT))
    finally:
        # Shut the workers down even if one of the pages raised.
        pool.close()
        pool.join()


if __name__ == "__main__":
    get_top_100_movies()
Requests+正則表達式 爬取貓眼電影