scrapy爬取豆瓣電影top250
阿新 • • 發佈:2017-06-28
imp port 爬取 all lba item text request top
# -*- coding: utf-8 -*-
# Scrapy spider that crawls the Douban Movie Top 250 list.

import scrapy
from douban.items import DoubanItem


class DoubanspiderSpider(scrapy.Spider):
    """Crawl the Douban Movie Top 250 list, following pagination links.

    Each movie entry on a list page is emitted as a separate ``DoubanItem``
    with the fields ``title``, ``content``, ``rating_num``, ``quote`` and
    ``image``. After a page is processed, the "next" pagination link (if
    present) is requested and parsed with the same callback.
    """

    name = "doubanspider"
    # NOTE: allowed_domains is intentionally left disabled. Restricting it to
    # the list page itself may put the paginated requests out of range.
    # allowed_domains = ["movie.douban.com/top250"]
    start_urls = ['http://movie.douban.com/top250']

    def parse(self, response):
        """Extract every movie on the page, then follow the next-page link.

        Args:
            response: The Scrapy response for one page of the Top 250 list.

        Yields:
            One ``DoubanItem`` per movie entry, followed by a
            ``scrapy.Request`` for the next page when one exists.
        """
        for entry in response.css('.article .grid_view li'):
            # Build a fresh item per movie: reusing one mutable item across
            # iterations lets later entries overwrite earlier ones while they
            # are still being processed downstream.
            yield DoubanItem(
                title=entry.css(
                    '.item .hd .title:nth-child(1)::text'
                ).extract_first(),
                # default='' keeps .strip() from failing on a missing <p>.
                content=entry.css(
                    '.item .bd p::text'
                ).extract_first(default='').strip(),
                rating_num=entry.css(
                    '.item .bd .star .rating_num::text'
                ).extract_first(),
                quote=entry.css(
                    '.item .bd .quote span::text'
                ).extract_first(),
                image=entry.css(
                    '.item .pic a img::attr(src)'
                ).extract_first(),
            )

        # Build the request for the next page.
        next_href = response.css(
            '.paginator .next a::attr(href)'
        ).extract_first()
        if next_href:
            # Resolve the (relative) href against the page actually fetched
            # rather than a hard-coded base, so redirects are honoured.
            url = response.urljoin(next_href)
            print(url)
            yield scrapy.Request(url=url, callback=self.parse)
scrapy爬取豆瓣電影top250