遍歷 抽屜的頁碼
阿新 • • 發佈:2018-11-05
# -*- coding: utf-8 -*-
import scrapy
import sys
import io

# Re-wrap stdout so printed text is encoded as gb18030 (intended for a
# Chinese-locale Windows console).  The original assigned to ``sys.stout``
# (typo), which only created an unused attribute and never affected output.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="gb18030")

from scrapy.selector import Selector,HtmlXPathSelector
from pyquery import PyQuery
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    """Spider that walks the pagination links of dig.chouti.com.

    Each parsed page is scanned for pager links; every link not seen
    before is printed and scheduled for crawling with ``parse`` as the
    callback, so the crawl recurses through the page numbers.
    """

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    # Set of absolute page URLs already scheduled; prevents requesting the
    # same page twice.  Defined on the class, so it is shared by instances.
    visited_list = set()

    def parse(self, response):
        """Yield a ``Request`` for every not-yet-visited pager link.

        Args:
            response: The downloaded page; its body is decoded as UTF-8.

        Yields:
            scrapy.http.Request: One request per newly discovered page URL,
            with this method as the callback.
        """
        content = str(response.body, encoding="utf-8")
        pq = PyQuery(content)

        # Alternative ways to extract the headline text, kept for reference:
        # item = pq.find("#content-list .item")
        # for i in item.items():
        #     print(i.find(".show-content ").text().strip())
        # hsx = Selector(response=response).xpath('//div[@id="content-list"]/div[@class="item"]')
        # for obj in hsx:
        #     a = obj.xpath('.//a[@class="show-content color-chag"]/text()').extract_first().strip()
        #     print(a)

        # Every <li> after the first one inside the pager element.
        pages = pq.find("#dig_lcpage li:gt(0)")
        for page in pages.items():
            index_web = page.find("a").attr("href")
            # Pager entries without a link have no href; skip them before
            # building a URL (otherwise we would format "...comNone").
            if index_web is None:
                continue

            web = "https://dig.chouti.com%s" % index_web
            if web in self.visited_list:
                continue

            self.visited_list.add(web)
            print(web)
            # Hand the request to the scheduler; the response is parsed by
            # this same method via the callback.
            yield Request(url=web, callback=self.parse)
(venv) D:\shan>scrapy crawl chouti --nolog https://dig.chouti.com/all/hot/recent/2 https://dig.chouti.com/all/hot/recent/3 https://dig.chouti.com/all/hot/recent/4 https://dig.chouti.com/all/hot/recent/5 https://dig.chouti.com/all/hot/recent/6 https://dig.chouti.com/all/hot/recent/7 https://dig.chouti.com/all/hot/recent/8 https://dig.chouti.com/all/hot/recent/9 https://dig.chouti.com/all/hot/recent/10 https://dig.chouti.com/all/hot/recent/1 https://dig.chouti.com/all/hot/recent/11 https://dig.chouti.com/all/hot/recent/12 https://dig.chouti.com/all/hot/recent/13 https://dig.chouti.com/all/hot/recent/14
如果要限制遞迴的層數，可以在 settings 檔案裡設定 DEPTH_LIMIT = 你要限制的層數。
新增請求頭也是在 settings 裡設定（例如 DEFAULT_REQUEST_HEADERS）。