python爬蟲 電影頁面資訊 xpath csv寫入 圖片儲存到本地
阿新 • • 發佈:2018-11-24
"""Crawl the Maoyan film list, save film details to CSV and posters to disk.

For every film linked from the list page the script fetches the detail page,
extracts name / region / release date / duration with XPath, appends a row to
``movie_info.csv`` and downloads the poster image into ``moviesImage/``.
"""
import csv
import os
import re
import time
import urllib.request

import requests
from lxml import etree

FILMS_URL = 'http://maoyan.com/films'
CSV_PATH = 'movie_info.csv'
IMAGE_DIR = 'moviesImage'
CSV_HEADER = ['電影名稱', '電影評分', '上映地區', '上映時間', '電影時長']

# Seconds to wait for a response before giving up; without a timeout a dead
# proxy would block the crawl forever.
REQUEST_TIMEOUT = 10
# Pause between detail-page requests, in seconds.
CRAWL_DELAY = 3
# The list-page score is not parsed; every film is recorded with this value.
PLACEHOLDER_SCORE = 90

REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
}
PROXIES = {
    'http': '88.146.227.253:8080'
}

# Runs of characters in the U+4E00..U+9FA5 range (used to split the
# "release date + region" text).
CJK_PATTERN = re.compile("[\u4e00-\u9fa5]+")
DIGITS_PATTERN = re.compile(r"\d+")
# Characters that are not allowed in file names on common file systems.
INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _fetch_html(url):
    """Download *url* through the configured proxy and return the lxml tree."""
    response = requests.get(
        url,
        headers=REQUEST_HEADERS,
        proxies=PROXIES,
        timeout=REQUEST_TIMEOUT,
    )
    return etree.HTML(response.text)


def _first(items, default=None):
    """Return the first element of *items*, or *default* when it is empty."""
    return items[0] if items else default


def getMoviesDetail(id, score):
    """Scrape one film's detail page, then store its CSV row and poster.

    Args:
        id: The film link taken from the list page, e.g. ``/films/1234``.
        score: The score value written to the CSV row as-is.

    A page without a film name (for example when the layout is not the
    expected one) is reported and skipped instead of raising ``IndexError``.
    Other missing fields are written as empty strings.

    Raises:
        requests.RequestException: If the detail page cannot be fetched.
    """
    movies_id = re.sub(r'/films/', '', id)
    details_url = 'http://maoyan.com/films/' + movies_id
    print(details_url)

    html = _fetch_html(details_url)
    brief = '//div[@class="movie-brief-container"]'

    # Film name.
    name = _first(html.xpath(brief + '/h3/text()'))
    if name is None:
        print('略過:頁面缺少電影名稱', details_url)
        return

    # The third list item mixes the release date with the region text: the
    # first run of Chinese characters is the region, the rest is the date.
    region_show_time = _first(html.xpath(brief + '/ul/li[3]/text()'), '')
    region = _first(CJK_PATTERN.findall(region_show_time), '')
    show_time = CJK_PATTERN.sub('', region_show_time).strip()

    # Duration: the first number found in the second list item.
    duration_text = _first(html.xpath(brief + '/ul/li[2]/text()'), '')
    duration = _first(DIGITS_PATTERN.findall(duration_text), '')

    # Poster image link.
    image_url = _first(html.xpath('//div[@class="avatar-shadow"]/img/@src'))

    # Append the film information to the CSV file.
    writerDataTocsv([name, score, region, show_time, duration])

    # Download the poster to the local image directory.
    if image_url is not None:
        dowloadImage(image_url, name)


def getMoviesId():
    """Fetch the film list page and process every film it links to.

    The CSV header row is written only when the CSV file is missing or empty,
    so repeated runs do not insert a header in the middle of the data. A
    failed detail request is reported and the crawl continues with the next
    film.

    Raises:
        requests.RequestException: If the list page itself cannot be fetched.
    """
    html = _fetch_html(FILMS_URL)
    films_list = html.xpath(
        '//div[@class = "movies-list"]/dl//div[@class="movie-item"]/a/@href'
    )

    # Write the column titles first, but only into a fresh file.
    if not os.path.exists(CSV_PATH) or os.path.getsize(CSV_PATH) == 0:
        writerDataTocsv(CSV_HEADER)

    # Visit each detail page, pausing between requests.
    for film_href in films_list:
        try:
            getMoviesDetail(film_href, PLACEHOLDER_SCORE)
        except requests.RequestException as e:
            print('請求失敗', film_href, e)
        time.sleep(CRAWL_DELAY)


def writerDataTocsv(data):
    """Append *data* as one row to the CSV file.

    Args:
        data: A sequence of cell values for a single row.

    Write failures are reported on stdout rather than raised.
    """
    try:
        # newline='' lets the csv module control line endings (otherwise
        # blank rows appear on Windows); UTF-8 keeps the Chinese text
        # writable regardless of the locale's default encoding.
        with open(CSV_PATH, 'a', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow(data)
    except (OSError, csv.Error) as e:
        print("寫入檔案錯誤", e)


def dowloadImage(image_url, name):
    """Download *image_url* into the image directory as ``<name>.jpg``.

    Args:
        image_url: Address of the image to download.
        name: Film name used as the file name; characters that are invalid
            in file names are replaced with ``_``.

    Download and file errors are reported on stdout rather than raised.
    """
    safe_name = INVALID_FILENAME_CHARS.sub('_', name)
    filename = os.path.join(IMAGE_DIR, safe_name + '.jpg')
    try:
        os.makedirs(IMAGE_DIR, exist_ok=True)
        urllib.request.urlretrieve(image_url, filename=filename)
    except (IOError, ValueError) as e:
        # ValueError covers URLs without a usable scheme.
        print('檔案操作失敗', e)


if __name__ == '__main__':
    getMoviesId()