程式人生 > Requests+正則表達式 爬取貓眼電影

Requests+正則表達式 爬取貓眼電影

movies core http status roc find apple ascii int

代碼:

import re
import json
from multiprocessing import Pool
import requests
from requests.exceptions import RequestException


# Listing URL template; %d is filled with the record offset (a multiple of 10).
basic_url = 'http://maoyan.com/board/4?offset=%d'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/59.0.3071.109 Safari/537.36'
    ),
}

# Output file: one JSON object per line, appended across runs.
OUTPUT_PATH = "maoyan_movies.txt"

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# One <dd>...</dd> entry of the board listing. Capture groups, in order:
# rank, image URL, title, cast text, release text, score integer part,
# score fraction part. re.S lets ".*?" span the newlines inside an entry.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>'
    r'.*?<img data-src="(.*?)"'
    r'.*?class="name"><a.*?>(.*?)</a>'
    r'.*?class="star">(.*?)</p>'
    r'.*?class="releasetime">(.*?)</p>'
    r'.*?class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S,
)


def get_page(url):
    """Fetch *url* and return the response body as text.

    Returns None when the request raises a ``RequestException`` (including
    a timeout) or when the status code is not 200 OK.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == requests.codes.ok:
        return response.text
    return None


def parse_page(content):
    """Yield one dict per movie entry found in the HTML text *content*.

    Each dict has the keys ``id``, ``image``, ``name``, ``actor``,
    ``releasetime`` and ``score``. Yields nothing when *content* is None or
    empty (e.g. because the download failed).
    """
    if not content:
        return
    for item in _MOVIE_PATTERN.findall(content):
        yield {
            'id': item[0],
            'image': item[1],
            'name': item[2].strip(),
            # Drop the 3-character label that prefixes the cast list.
            'actor': item[3].strip()[3:],
            # Drop the 5-character label that prefixes the release date.
            'releasetime': item[4][5:],
            # Integer part (e.g. "9.") and fraction part (e.g. "6") are
            # separate elements in the markup; join them back together.
            'score': item[5] + item[6],
        }


def save_to_file(content):
    """Append *content* to OUTPUT_PATH as a single JSON line.

    The file is opened per call and closed on exit, so no handle is kept
    open at module level or shared with worker processes. The record and
    its trailing newline are serialized first and passed to one ``write``
    call so they are not emitted as separate pieces.
    """
    line = json.dumps(content, ensure_ascii=False) + "\n"
    with open(OUTPUT_PATH, "a", encoding="utf-8") as output:
        output.write(line)


def get_page_movies(offset):
    """Download, parse and save one page of the movie board.

    *offset* is the zero-based page index; it is multiplied by the page
    size (10) to build the full page URL. A page that could not be
    downloaded is skipped.
    """
    step = 10
    url = basic_url % (step * offset)
    html = get_page(url)
    if html is None:
        return
    for movie_info in parse_page(html):
        save_to_file(movie_info)


def get_top_100_movies():
    """Collect the Maoyan top-100 movies using a pool of 4 processes.

    Saved fields: rank, image URL, title, leading actors, release date and
    score.
    """
    pool = Pool(processes=4)
    try:
        pool.map(get_page_movies, range(10))
    finally:
        # Release the workers even if a page handler raised.
        pool.close()
        pool.join()


if __name__ == "__main__":
    get_top_100_movies()

Requests+正則表達式 爬取貓眼電影