爬取多個 URL

# -*- coding: utf-8 -*-
import scrapy
from qiubai.items import QiubaiItem


class QiushibaiSpider(scrapy.Spider):
    """Spider for the qiushibaike.com text section that crawls multiple pages.

    Page 1 is fetched via ``start_urls``; each later page is scheduled
    recursively by yielding a new ``scrapy.Request`` whose callback is
    ``parse`` itself.
    """

    name = 'qiushibai'
    # allowed_domains = ['www.qiushibaike.com/text/']
    start_urls = ['http://www.qiushibaike.com/text/']
    # URL template for subsequent pages; %d is filled with the page number.
    url = "https://www.qiushibaike.com/text/page/%d/"
    # Current page counter (page 1 is already covered by start_urls).
    page = 1

    def parse(self, response):
        """Parse one listing page: yield one item per post, then schedule
        the next page.

        Args:
            response: the Scrapy response for a listing page.

        Yields:
            QiubaiItem: author/content for each post on the page, followed
            by a Request for the next page while ``self.page`` <= 13.
        """
        # Each post lives in its own <div> under #content-left.
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # extract_first() is equivalent to extract()[0] but returns
            # None instead of raising when the selector matches nothing.
            author = div.xpath("./div/a[2]/h2/text()").extract_first()
            content = div.xpath('.//div[@class="content"]/span/text()').extract_first()
            # Store the parsed values on an item and hand it to the pipeline.
            item = QiubaiItem()
            item["author"] = author
            item["content"] = content
            yield item
        # Schedule the following page (the original wrapped this in a
        # redundant format() call on an already-formatted string).
        if self.page <= 13:
            print("正在爬取第%d頁" % self.page)
            self.page += 1
            new_url = self.url % self.page
            yield scrapy.Request(url=new_url, callback=self.parse)

重點：用 yield 搭配 Request 的 callback 參數，讓 parse 遞迴爬取多個 URL。