1. 程式人生 > scrapy ------ 爬取豆瓣電影TOP250

scrapy ------ 爬取豆瓣電影TOP250

轉載自 —> 原文

#items.py
# -*- coding: utf-8 -*-
import scrapy

class DoubanMovieItem(scrapy.Item):
    """Container for the fields scraped for a single Douban Top 250 movie entry."""
    ranking = scrapy.Field()        # position in the ranking
    movie_name = scrapy.Field()     # movie title
    score = scrapy.Field()          # rating score
    score_num = scrapy.Field()      # number of reviewers
#douban_spider.py
#-*- coding:utf-8 -*-
import scrapy
from scrapy.spiders import Spider

from scrapyspider.items import DoubanMovieItem


class DoubanMovieTop250spider(Spider):
    """Crawl the Douban movie Top 250 list and yield one item per movie.

    Starts at the first list page and keeps following the "next" link
    until no such link is present on the page.
    """

    name = 'douban_movie_top250'
    # Browser-like User-Agent sent with every request issued by this spider.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }

    def start_requests(self):
        """Yield the request for the first page of the Top 250 list."""
        url = 'https://movie.douban.com/top250'
        yield scrapy.Request(url, headers=self.headers)

    def parse(self, response):
        """Extract every movie on the page, then follow the next-page link.

        Yields:
            DoubanMovieItem: one item per ``<li>`` in the ``grid_view`` list.
            A field is ``None`` when its node is missing from the page.
            scrapy.Request: a request for the next page, when a link exists.
        """
        for movie in response.xpath('//ol[@class="grid_view"]/li'):
            # Build a fresh item per movie; reusing one instance would make
            # every yielded item the same, repeatedly overwritten, object.
            item = DoubanMovieItem()
            # extract_first() returns None for a missing node instead of
            # raising IndexError, which would abort the rest of the page
            # (including the next-page request below).
            item['ranking'] = movie.xpath(
                './/div[@class="pic"]/em/text()').extract_first()
            item['movie_name'] = movie.xpath(
                './/div[@class="hd"]/a/span[1]/text()').extract_first()
            item['score'] = movie.xpath(
                './/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract_first()
            item['score_num'] = movie.xpath(
                './/div[@class="star"]/span[4]/text()').extract_first()
            yield item

        # Link to the next page; absent on the last page.
        next_url = response.xpath(
            '//span[@class="next"]/a/@href').extract_first()
        if next_url:
            # The href is relative, so resolve it against the current page
            # URL instead of concatenating onto a hard-coded base.
            yield scrapy.Request(response.urljoin(next_url),
                                 headers=self.headers)