scrapy ------ 爬取豆瓣電影TOP250
阿新 • • 發佈:2019-01-03
轉載自 —> 原文
#items.py
# -*- coding: utf-8 -*-
import scrapy
class DoubanMovieItem(scrapy.Item):
    """Fields collected for a single Douban movie entry."""

    ranking = scrapy.Field()  # rank
    movie_name = scrapy.Field()  # movie title
    score = scrapy.Field()  # rating
    score_num = scrapy.Field()  # number of reviewers
#douban_spider.py
#-*- coding:utf-8 -*-
from scrapy.spider import Spider
from scrapyspider.items import DoubanMovieItem
import scrapy
class DoubanMovieTop250spider(Spider):
    """Spider that scrapes the Douban movie Top 250 listing, page by page.

    Each listing entry is emitted as a ``DoubanMovieItem``; after a page is
    processed, the "next" link (if present) is followed with the same
    request headers.
    """

    name = 'douban_movie_top250'
    # Custom User-Agent sent with every request issued by this spider.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }

    def start_requests(self):
        """Yield the request for the first listing page."""
        url = 'https://movie.douban.com/top250'
        yield scrapy.Request(url, headers=self.headers)

    def parse(self, response):
        """Extract one item per listed movie, then follow the next-page link.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``DoubanMovieItem`` for every ``<li>`` in the ``grid_view``
            list, followed by a ``scrapy.Request`` for the next page when a
            next-page link exists.
        """
        movies = response.xpath('//ol[@class="grid_view"]/li')
        for movie in movies:
            # Build a fresh item per movie: reusing one instance would make
            # every yielded item share (and overwrite) the same data.
            item = DoubanMovieItem()
            # extract_first() yields None instead of raising IndexError when
            # a node is missing, so one malformed entry cannot abort the page.
            item['ranking'] = movie.xpath(
                './/div[@class="pic"]/em/text()').extract_first()
            item['movie_name'] = movie.xpath(
                './/div[@class="hd"]/a/span[1]/text()').extract_first()
            item['score'] = movie.xpath(
                './/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract_first()
            item['score_num'] = movie.xpath(
                './/div[@class="star"]/span[4]/text()').extract_first()
            yield item

        # Get the link to the next page.
        next_url = response.xpath(
            '//span[@class="next"]/a/@href').extract_first()
        if next_url:
            # Resolve the href against the current page URL rather than
            # concatenating it onto a hard-coded base.
            yield scrapy.Request(
                response.urljoin(next_url), headers=self.headers)