1. 程式人生 > >scrapy爬取豆瓣電影top250

scrapy爬取豆瓣電影top250

imp port 爬取 all lba item text request top

 1 # -*- coding: utf-8 -*-
 2 # scrapy爬取豆瓣電影top250
 3 
 4 import scrapy
 5 from douban.items import DoubanItem
 6 
 7 
 8 class DoubanspiderSpider(scrapy.Spider):
 9     name = "doubanspider"
10     # allowed_domains = ["movie.douban.com/top250"]註意這裏的主頁限制,一旦翻頁可能超出範圍
11     start_urls = [http://movie.douban.com/top250
] 12 13 def parse(self, response): 14 item = DoubanItem() 15 for each in response.css(.article .grid_view li): 16 title = each.css(.item .hd .title:nth-child(1)::text).extract_first() 17 content = each.css(.item .bd p::text).extract_first().strip()
18 rating_num = each.css(.item .bd .star .rating_num::text).extract_first() 19 quote = each.css(.item .bd .quote span::text).extract_first() 20 image = each.css(.item .pic a img::attr(src)).extract_first() 21 item[title] = title 22 item[
content] = content 23 item[rating_num] = rating_num 24 item[quote] = quote 25 item[image] = image 26 27 yield item 28 29 # 構造下一頁的請求 30 next = response.css(.paginator .next a::attr(href)).extract_first() 31 if next: 32 url = http://movie.douban.com/top250 + next 33 print(url) 34 yield scrapy.Request(url=url, callback=self.parse)

scrapy爬取豆瓣電影top250