豆瓣電影資訊爬取並儲存到 Excel
阿新 • • 發佈:2019-01-31
import re
import openpyxl
import requests
from bs4 import BeautifulSoup
class Movie(object):
    """Plain record describing one movie entry.

    ``title`` and ``href`` are required at construction time. The remaining
    fields (``director``, ``actor``, ``type``, ``region``, ``year``) start
    out as ``None`` and are filled in later, either through the setter
    methods or by direct attribute assignment. Values are stored exactly as
    given; nothing is validated or converted.
    """

    def __init__(self, title, href):
        """Create a movie with its title and link.

        Args:
            title: Title of the movie.
            href: Link associated with the movie.
        """
        self.title = title
        self.href = href
        # Initialise the optional fields so that reading one before it has
        # been set yields None instead of raising AttributeError.
        self.director = None
        self.actor = None
        self.type = None
        self.region = None
        self.year = None

    def __repr__(self):
        """Return a short debugging representation."""
        return "%s(title=%r, href=%r)" % (
            type(self).__name__, self.title, self.href)

    def set_director(self, director):
        """Store the director value."""
        self.director = director

    def get_director(self):
        """Return the stored director value, or None if never set."""
        return self.director

    def set_actor(self, actor):
        """Store the actor value."""
        self.actor = actor

    def get_actor(self):
        """Return the stored actor value, or None if never set."""
        return self.actor

    # The parameter keeps its original name ``type`` (shadowing the builtin
    # inside this method only) so keyword callers keep working.
    def set_type(self, type):
        """Store the type value."""
        self.type = type

    def get_type(self):
        """Return the stored type value, or None if never set."""
        return self.type

    def set_region(self, region):
        """Store the region value."""
        self.region = region

    def get_region(self):
        """Return the stored region value, or None if never set."""
        return self.region

    def set_year(self, year):
        """Store the year value."""
        self.year = year

    def get_year(self):
        """Return the stored year value, or None if never set."""
        return self.year
# Scrape a Douban movie list page by page and export one row per movie to an
# Excel workbook.

# URL template of the list; "%s" receives the offset of the first item shown.
url = "https://www.douban.com/doulist/3936288/?start=%s"
# Number of list pages to download.
pages = 10
# Offset step between two consecutive pages.
PAGE_SIZE = 25
# Seconds to wait for each HTTP response before giving up.
REQUEST_TIMEOUT = 10
# File the workbook is written to.
OUTPUT_PATH = "douban.xlsx"
# Movie attribute -> position of its line after splitting the ".abstract"
# text on newlines.
ABSTRACT_FIELDS = (
    ("director", 2),
    ("actor", 4),
    ("type", 6),
    ("region", 8),
    ("year", 10),
)


def build_page_urls(template, page_count, page_size=PAGE_SIZE):
    """Return the URL of every list page.

    Args:
        template: URL containing one ``%s`` placeholder for the item offset.
        page_count: Number of pages to generate.
        page_size: Offset step between consecutive pages.

    Returns:
        A list of ``page_count`` URLs with offsets 0, page_size, ...
    """
    return [template % (page * page_size) for page in range(page_count)]


def abstract_field(lines, index):
    """Return ``lines[index]``, or an empty string when it does not exist.

    Entries with fewer lines than expected then produce empty cells
    instead of aborting the whole export with an IndexError.
    """
    return lines[index] if index < len(lines) else ""


def parse_movie(subject):
    """Build a ``Movie`` from one ``.doulist-subject`` element.

    Args:
        subject: BeautifulSoup tag of a single list entry.

    Returns:
        The populated ``Movie``, or None when the entry has no
        ``.title a`` link.
    """
    links = subject.select('.title a')
    if not links:
        return None
    link = links[0]
    # Read title and link through the tag API rather than regex-matching the
    # serialised HTML: HTML entities are decoded and a missing attribute does
    # not raise.
    movie = Movie(link.get_text(strip=True), link.get('href', ''))

    abstracts = subject.select('.abstract')
    lines = abstracts[0].get_text().split("\n") if abstracts else []
    for attribute, index in ABSTRACT_FIELDS:
        setattr(movie, attribute, abstract_field(lines, index))
    return movie


def fetch_movies(page_urls):
    """Download every page and return the movies found, in page order.

    Raises:
        requests.RequestException: On a timeout, a connection problem, or
            an HTTP error status.
    """
    movies = []
    for page_url in page_urls:
        # NOTE(review): the site may reject clients without a browser-like
        # User-Agent header -- confirm and pass ``headers=`` if needed.
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        # Fail loudly instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content.decode("utf-8"), 'html.parser')
        for subject in soup.select('.doulist-subject'):
            movie = parse_movie(subject)
            if movie is not None:
                movies.append(movie)
    return movies


def save_movies(movies, path):
    """Write one row per movie (seven columns) to a new workbook at ``path``."""
    wb = openpyxl.Workbook()
    # ``active`` replaces the deprecated ``get_active_sheet()``.
    sheet = wb.active
    for row, movie in enumerate(movies, start=1):
        values = (
            movie.title,
            movie.href,
            movie.director,
            movie.actor,
            movie.type,
            movie.region,
            movie.year,
        )
        for column, value in enumerate(values, start=1):
            sheet.cell(row=row, column=column).value = value
    wb.save(path)


def main():
    """Scrape all configured pages and export them to ``OUTPUT_PATH``."""
    movies = fetch_movies(build_page_urls(url, pages))
    save_movies(movies, OUTPUT_PATH)
    print("ok")


if __name__ == "__main__":
    main()