1. 程式人生 > 豆瓣電影資訊爬取並儲存到excel

豆瓣電影資訊爬取並儲存到excel

import re

import openpyxl
import requests
from bs4 import BeautifulSoup


class Movie:
    """One film entry scraped from a Douban list page.

    ``title`` and ``href`` are required.  The remaining fields start as
    ``None`` so every getter is safe to call before its matching setter.
    """

    # (attribute name, line index) pairs: which line of the newline-split
    # abstract text holds each field.  The indices are the ones the original
    # script hard-coded.
    # NOTE(review): they presumably mirror the page layout (one field per
    # line with a blank line in between) -- confirm against the live markup.
    _ABSTRACT_LINES = (
        ("director", 2),
        ("actor", 4),
        ("type", 6),
        ("region", 8),
        ("year", 10),
    )

    def __init__(self, title: str, href: str) -> None:
        self.title = title
        self.href = href
        # Initialised up front so reading a field that was never set yields
        # None instead of raising AttributeError.
        self.director = None
        self.actor = None
        self.type = None
        self.region = None
        self.year = None

    @classmethod
    def from_abstract(cls, title, href, abstract):
        """Build a Movie from the text of an entry's abstract block.

        ``abstract`` is split on newlines and each field is read from its
        fixed line index (see ``_ABSTRACT_LINES``), with surrounding
        whitespace stripped.  A field whose line is missing because the
        abstract is shorter than expected is left as ``None`` rather than
        raising IndexError.
        """
        movie = cls(title, href)
        lines = abstract.split("\n")
        for field, index in cls._ABSTRACT_LINES:
            if index < len(lines):
                setattr(movie, field, lines[index].strip())
        return movie

    def set_director(self, director):
        self.director = director

    def get_director(self):
        return self.director

    def set_actor(self, actor):
        self.actor = actor

    def get_actor(self):
        return self.actor

    # The parameter shadows the builtin ``type``; the name is kept so that
    # existing keyword-argument callers keep working.
    def set_type(self, type):
        self.type = type

    def get_type(self):
        return self.type

    def set_region(self, region):
        self.region = region

    def get_region(self):
        return self.region

    def set_year(self, year):
        self.year = year

    def get_year(self):
        return self.year


# List URL; ``%s`` receives the offset of the first entry on the page.
LIST_URL = "https://www.douban.com/doulist/3936288/?start=%s"
# Number of list pages to download.
PAGE_COUNT = 10
# Offset step between consecutive pages.
PAGE_SIZE = 25
# Seconds to wait on the server before giving up on a request.
REQUEST_TIMEOUT = 10
# Spreadsheet written by main().
OUTPUT_PATH = "douban.xlsx"


def _fetch_entries(page_count=PAGE_COUNT):
    """Download the list pages and return all ``.doulist-subject`` elements."""
    entries = []
    for page in range(page_count):
        # Without a timeout a stalled connection would hang the script.
        response = requests.get(
            LIST_URL % (page * PAGE_SIZE), timeout=REQUEST_TIMEOUT
        )
        soup = BeautifulSoup(str(response.content, "utf-8"), "html.parser")
        entries.extend(soup.select(".doulist-subject"))
    return entries


def _parse_movie(entry):
    """Turn one ``.doulist-subject`` element into a Movie.

    Returns ``None`` when the entry has no ``.title a`` link or no
    ``.abstract`` block, so one malformed entry does not abort the run.
    """
    links = entry.select(".title a")
    abstracts = entry.select(".abstract")
    if not links or not abstracts:
        return None
    link = links[0]
    # Read the title and URL from the parsed tag instead of running regexes
    # over its string form.
    return Movie.from_abstract(
        link.get_text(strip=True),
        link.get("href"),
        abstracts[0].get_text(),
    )


def _save_movies(movies, path=OUTPUT_PATH):
    """Write one row per movie (title, href, director, actor, type, region, year)."""
    workbook = openpyxl.Workbook()
    # ``Workbook.active`` replaces the removed ``get_active_sheet()``.
    sheet = workbook.active
    for movie in movies:
        sheet.append([
            movie.title,
            movie.href,
            movie.director,
            movie.actor,
            movie.type,
            movie.region,
            movie.year,
        ])
    workbook.save(path)


def main():
    """Scrape the list, save it to ``OUTPUT_PATH`` and print ``ok``."""
    parsed = (_parse_movie(entry) for entry in _fetch_entries())
    _save_movies([movie for movie in parsed if movie is not None])
    print("ok")


if __name__ == "__main__":
    main()