1. 程式人生 > >遍歷 抽屜的頁碼

遍歷 抽屜的頁碼

# -*- coding: utf-8 -*-
import scrapy
import sys
import io
# Re-wrap stdout so that print() output is encoded as GB18030
# (presumably for a Chinese-locale Windows console -- confirm for your terminal).
# The attribute must be `sys.stdout`; assigning to a misspelled name has no effect.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="gb18030")
from scrapy.selector import Selector,HtmlXPathSelector
from pyquery import PyQuery
from scrapy.http import Request

class ChoutiSpider(scrapy.Spider):
    """Follow the pagination links of dig.chouti.com and print each page URL once.

    Starting from ``start_urls``, every response is scanned for links inside
    the ``#dig_lcpage`` element; each link that has not been seen before is
    printed and requested with :meth:`parse` as the callback, so the
    pagination is walked recursively.
    """

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    # Absolute page URLs that have already been scheduled. Kept as a
    # class-level set so the same page is never printed/requested twice.
    visited_list = set()

    def parse(self, response):
        """Yield a follow-up request for every not-yet-seen pagination link.

        Args:
            response: The downloaded page; its body is decoded as UTF-8.

        Yields:
            Request: One request per new page URL, handled by this same
            method.
        """
        content = response.body.decode("utf-8")
        pq = PyQuery(content)

        # Every <li> after the first one inside the #dig_lcpage element.
        pages = pq.find("#dig_lcpage li:gt(0)")
        for page in pages.items():
            index_web = page.find("a").attr("href")
            if index_web is None:
                # This entry has no <a href=...>, so there is nothing to follow.
                continue

            web = "https://dig.chouti.com%s" % index_web
            if web in self.visited_list:
                continue

            self.visited_list.add(web)
            print(web)
            # Hand the request to the scheduler; parse() is the callback
            # that will process the response.
            yield Request(url=web, callback=self.parse)
(venv) D:\shan>scrapy crawl chouti --nolog
https://dig.chouti.com/all/hot/recent/2
https://dig.chouti.com/all/hot/recent/3
https://dig.chouti.com/all/hot/recent/4
https://dig.chouti.com/all/hot/recent/5
https://dig.chouti.com/all/hot/recent/6
https://dig.chouti.com/all/hot/recent/7
https://dig.chouti.com/all/hot/recent/8
https://dig.chouti.com/all/hot/recent/9
https://dig.chouti.com/all/hot/recent/10
https://dig.chouti.com/all/hot/recent/1
https://dig.chouti.com/all/hot/recent/11
https://dig.chouti.com/all/hot/recent/12
https://dig.chouti.com/all/hot/recent/13
https://dig.chouti.com/all/hot/recent/14

 

 

如果要限制遞迴的層數,可以在 settings 檔案裡設定 DEPTH_LIMIT = 你要限制的層數。

新增請求頭也是在 settings 檔案裡設定(例如 DEFAULT_REQUEST_HEADERS)。