多種方法爬取貓眼電影Top100排行榜,儲存到csv檔案,下載封面圖
阿新 • • 發佈:2019-01-09
參考連結:https://blog.csdn.net/BF02jgtRS00XKtCx/article/details/83663400
因貓眼網站有些更新,參考連結中的部分程式碼執行報錯,特修改一下
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import re
from multiprocessing.pool import Pool
import requests
from bs4 import BeautifulSoup
from lxml import etree
from requests.exceptions import RequestException
def get_one_page(url):
    """Fetch a page and return its HTML text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``RequestException``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
    }
    try:
        # Without a timeout a stalled connection would block the worker forever.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
# Get the full-size cover image URL
def get_thumb(url):
    """Return the cover URL with its ``@...`` thumbnail suffix removed.

    Thumbnail URLs look like ``http://p0.meituan.net/movie/<file>@160w_220h_1e_1c``;
    per the original author's note, dropping the ``@160w_220h_1e_1c`` part
    gives the large image.

    Args:
        url: Image URL, with or without an ``@`` suffix.

    Returns:
        Everything before the first ``@``; the URL unchanged when it contains
        no ``@`` (the previous regex version raised ``AttributeError`` there).
    """
    full_url, _, _ = url.partition('@')
    return full_url
# Extract the release date
def get_release_time(data):
    """Return the release date portion of a release string.

    The date is the text before the first ``(``, or the whole string when
    there is no parenthesis, e.g. ``'1993-01-01(...)'`` -> ``'1993-01-01'``.

    Args:
        data: Release text with the leading label already removed.

    Returns:
        The date text, or ``'未知'`` when ``data`` is empty/``None`` or no
        date precedes the parenthesis.
    """
    if not data:
        return '未知'
    # '$' lets the pattern match strings without '(' too, so search() always
    # finds a match for a str input; only an empty capture means "no date".
    match = re.search(r'(.*?)(\(|$)', data)
    release_time = match.group(1).strip()
    return release_time or '未知'
# Extract the country/region
def get_release_area(data):
    """Return the country/region found inside the last pair of parentheses.

    Args:
        data: Release text such as ``'1993-01-01(...)'``.

    Returns:
        The text between the parentheses, or ``'未知'`` when ``data`` is
        empty/``None``, has no parentheses, or the parentheses are empty.
    """
    if not data:
        return '未知'
    # The leading greedy '.*' makes the capture start at the last '(' that is
    # still followed by a ')'.
    match = re.search(r'.*\((.*)\)', data)
    if match is None:
        return '未知'
    return match.group(1).strip() or '未知'
# Regular-expression based parser
def parse_one_page(html):
    """Yield one dict per ``<dd>`` movie entry found in ``html`` using a regex.

    Args:
        html: Page source as a string.

    Yields:
        Dicts with the keys ``index``, ``thumb``, ``name``, ``star``,
        ``time``, ``area`` and ``score``.
    """
    # Raw string so that '\d' is a regex escape rather than an invalid string
    # escape; re.S lets '.' match newlines so one entry can span several lines.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S)
    for index, thumb, name, star, release, integer, fraction in pattern.findall(html):
        # Drops the first 5 characters, presumably the release-time label
        # shown on the page -- TODO confirm against the live markup.
        release_info = release.strip()[5:]
        yield {
            'index': index,
            'thumb': get_thumb(thumb),  # strip the thumbnail size suffix
            'name': name,
            # Drops the first 3 characters, presumably the cast label.
            'star': star.strip()[3:],
            # The release text holds both the date and the region.
            'time': get_release_time(release_info),
            'area': get_release_area(release_info),
            # The score is rendered as integer part + fraction part.
            'score': integer.strip() + fraction.strip(),
        }
# lxml + XPath based parser
def parse_one_page2(html):
    """Yield one dict per ``<dd>`` movie entry, extracted with lxml XPath.

    Args:
        html: Page source as a string.

    Yields:
        Dicts with the keys ``index``, ``thumb``, ``name``, ``star``,
        ``time``, ``area`` and ``score``. The release date is stored under
        ``time`` (previously misspelled ``realease_time``) so the rows match
        the other parsers and the CSV field names.

    Raises:
        IndexError: If an entry lacks one of the expected nodes.
    """
    tree = etree.HTML(html)
    for entry in tree.xpath('/html/body/div[4]//div//dd'):
        # Looked up once; both the date and the region come from this text.
        # Drops the first 5 characters, presumably the release-time label.
        release_info = entry.xpath('.//p[@class="releasetime"]/text()')[0].strip()[5:]
        score_integer = entry.xpath('./div/div/div[2]/p/i[1]/text()')[0]
        score_fraction = entry.xpath('./div/div/div[2]/p/i[2]/text()')[0]
        yield {
            'index': entry.xpath('./i/text()')[0],
            'thumb': get_thumb(str(entry.xpath('./a/img[2]/@data-src')[0].strip())),
            'name': entry.xpath('./div/div/div[1]/p[1]/a/@title')[0],
            # Drops the first 3 characters, presumably the cast label.
            'star': entry.xpath('.//p[@class="star"]/text()')[0].strip()[3:],
            'time': get_release_time(release_info),
            'area': get_release_area(release_info),
            'score': score_integer + score_fraction,
        }
# BeautifulSoup + CSS selector based parser
def parse_one_page3(html):
    """Yield one dict per movie entry, extracted with CSS selectors.

    Each selector is evaluated once and the result lists are walked in
    lockstep, so the number of rows follows the page content instead of a
    hard-coded 10 (a shorter page no longer raises ``IndexError``).

    Args:
        html: Page source as a string.

    Yields:
        Dicts with the keys ``index``, ``thumb``, ``name``, ``star``,
        ``time``, ``area`` and ``score``.
    """
    soup = BeautifulSoup(html, 'lxml')
    rows = zip(
        soup.select('i.board-index'),
        soup.select('.board-img'),
        soup.select('.name a'),
        soup.select('.star'),
        soup.select('.releasetime'),
        soup.select('.integer'),
        soup.select('.fraction'),
    )
    for index, img, name, star, release, integer, fraction in rows:
        # Drops the first 5 characters, presumably the release-time label.
        release_info = release.string.strip()[5:]
        yield {
            'index': index.string,
            'thumb': get_thumb(img['data-src']),
            'name': name.string,
            # Drops the first 3 characters, presumably the cast label.
            'star': star.string.strip()[3:],
            'time': get_release_time(release_info),
            'area': get_release_area(release_info),
            'score': integer.string + fraction.string,
        }
# BeautifulSoup + find_all based parser
def parse_one_page4(html):
    """Yield one dict per movie entry, extracted with ``find_all``.

    Each ``find_all`` query runs once and the result lists are walked in
    lockstep, so the number of rows follows the page content instead of a
    hard-coded 10 (a shorter page no longer raises ``IndexError``).

    Args:
        html: Page source as a string.

    Yields:
        Dicts with the keys ``index``, ``thumb``, ``name``, ``star``,
        ``time``, ``area`` and ``score``.
    """
    soup = BeautifulSoup(html, 'lxml')
    rows = zip(
        soup.find_all(class_='board-index'),
        soup.find_all(class_='board-img'),
        soup.find_all(name='p', attrs={'class': 'name'}),
        soup.find_all(name='p', attrs={'class': 'star'}),
        soup.find_all(class_='releasetime'),
        soup.find_all(name='i', attrs={'class': 'integer'}),
        soup.find_all(name='i', attrs={'class': 'fraction'}),
    )
    for index, img, name, star, release, integer, fraction in rows:
        # Drops the first 5 characters, presumably the release-time label.
        release_info = release.string.strip()[5:]
        yield {
            'index': index.string,
            'thumb': get_thumb(img.attrs['data-src']),
            'name': name.string,
            # Drops the first 3 characters, presumably the cast label.
            'star': star.string.strip()[3:],
            'time': get_release_time(release_info),
            'area': get_release_area(release_info),
            'score': integer.string + fraction.string,
        }
# Save one record to the CSV file
def write_to_file3(item):
    """Append ``item`` as a single row to ``貓眼top100.csv``.

    Args:
        item: Dict whose keys are among the columns listed below; any other
            key makes ``csv.DictWriter`` raise ``ValueError``.
    """
    columns = ['index', 'thumb', 'name', 'star', 'time', 'area', 'score']
    # mode 'a' appends to the file; per the original author's note, the
    # utf_8_sig encoding keeps the exported CSV from showing garbled text.
    # No header row is written.
    with open('貓眼top100.csv', mode='a', newline='', encoding='utf_8_sig') as csv_file:
        csv.DictWriter(csv_file, fieldnames=columns).writerow(item)
# Download a cover image
def download_thumb(name, url, num):
    """Download the image at ``url`` and save it as ``封面圖/<name>.jpg``.

    Request failures (including timeouts and non-2xx responses) are printed
    and the download is skipped, so one bad image does not stop the run.

    Args:
        name: Movie name, used as the file name; characters that are not
            allowed in file names are replaced with ``_``.
        url: Address of the image.
        num: Ranking number, used only in the progress message.
    """
    import os

    folder = '封面圖'
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
    try:
        response = requests.get(url, timeout=10)
        # Avoid saving an HTTP error page as if it were an image.
        response.raise_for_status()
    except RequestException as e:
        print(e)
        return
    # The target directory may not exist yet on a first run.
    os.makedirs(folder, exist_ok=True)
    # Must be 'wb', not 'w': image data is binary.
    with open(os.path.join(folder, safe_name + '.jpg'), 'wb') as f:
        f.write(response.content)
    print('第%s部電影封面下載完畢' % num)
    print('------')
def main(offset):
    """Scrape one board page: save each entry to CSV and download its cover.

    Args:
        offset: Value for the ``offset`` query parameter of the board URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on request errors or non-200 responses;
        # passing that to the parser would raise.
        print('Failed to fetch %s, skipping' % url)
        return
    for item in parse_one_page4(html):
        write_to_file3(item)
        download_thumb(item['name'], item['thumb'], item['index'])
if __name__ == '__main__':
    # Pages are fetched with offsets 0, 10, ..., 90. The context manager
    # releases the worker processes once the blocking map() call returns.
    with Pool() as pool:
        pool.map(main, [page * 10 for page in range(10)])