scrapy爬取中關村在線手機頻道
阿新 • • 發佈:2017-06-24
tex ice extract base .section title .html release nbsp
# -*- coding: utf-8 -*-
"""Spider that crawls the ZOL mobile-phone channel listing and detail pages."""
import scrapy
from pyquery import PyQuery as pq

from zolphone.items import ZolphoneItem


class PhoneSpider(scrapy.Spider):
    """Walk the paginated phone listing and emit one item per product page.

    Flow: ``start_requests`` -> ``parse_index`` (listing page) ->
    ``parse_detail`` (product page) -> ``ZolphoneItem``.
    """

    name = "phone"

    # Listing URL without the trailing page number; a complete URL looks like
    # http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_1_0_1.html
    start_url = 'http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_1_0_'

    # Last listing page to request (inclusive). Can be overridden from the
    # command line with ``-a last_page=N``; such arguments arrive as strings,
    # hence the int() conversion in start_requests.
    last_page = 208

    def start_requests(self):
        """Yield one request per listing page, from page 1 to ``last_page``."""
        for page in range(1, int(self.last_page) + 1):
            url = '{}{}.html'.format(self.start_url, page)
            yield scrapy.Request(url, callback=self.parse_index)

    def parse_index(self, response):
        """Extract product links from a listing page and follow each one.

        :param response: response for a listing page.
        :returns: generator of requests handled by ``parse_detail``.
        """
        doc = pq(response.text)
        for entry in doc('.list-box .list-item').items():
            href = entry.find('.pro-intro h3 a').attr('href')
            if not href:
                # attr() gives None when the anchor or its href is missing;
                # skip the entry instead of failing the whole page.
                continue
            # urljoin resolves both site-relative and absolute hrefs.
            yield scrapy.Request(url=response.urljoin(href),
                                 callback=self.parse_detail)

    def parse_detail(self, response):
        """Build a ``ZolphoneItem`` from a product detail page.

        :param response: response for a product detail page.
        :returns: generator yielding a single populated item.
        """
        doc = pq(response.text)

        item = ZolphoneItem()
        item['title1'] = response.css('.page-title h1::text').extract_first()
        item['title2'] = doc('.page-title h2').text()
        item['price'] = doc('.product-price .price-type').text()
        item['release_time'] = doc('.section div h3 .showdate').text()

        self.logger.debug('Scraped %s: %r', response.url, dict(item))
        yield item
import scrapy


class ZolphoneItem(scrapy.Item):
    """Container for the values scraped from one phone detail page."""

    # Text of the page title's <h1> element.
    title1 = scrapy.Field()
    # Text of the page title's <h2> element.
    title2 = scrapy.Field()
    # Text of the price element on the product page.
    price = scrapy.Field()
    # Text of the release-date element on the product page.
    release_time = scrapy.Field()
scrapy爬取中關村在線手機頻道