通過相對路徑獲取下一頁的連結
阿新 • • 發佈:2018-11-28
# -*- coding: utf-8 -*-
import re
from urllib.parse import urljoin

import scrapy

from ..items import ScrapyItem


class DocsScrapySpider(scrapy.Spider):
    """Crawl the Scrapy documentation by following each page's "next" link.

    The crawl starts at the documentation index page. For every page visited
    the spider emits one ``ScrapyItem`` holding the first
    ``<div class="document">`` match, then schedules the page referenced by
    the ``<a rel="next">`` link, if one is present.
    """

    name = 'docs.scrapy'
    allowed_domains = ['docs.scrapy.org']
    start_urls = ['https://docs.scrapy.org/en/latest/index.html']

    def parse(self, response):
        """Yield the page's document block, then a request for the next page.

        Args:
            response: The downloaded page handed over by Scrapy.

        Yields:
            A ``ScrapyItem`` whose ``docs_scrapy`` field holds the extracted
            document block, followed by a request for the next page when the
            current page links to one.
        """
        document_html = response.xpath(
            '//div[@class="document"]'
        ).extract_first()
        print(document_html)

        item = ScrapyItem()
        item["docs_scrapy"] = document_html
        yield item

        # The "next" link is a relative href; no link means nothing to follow.
        relative_href = response.xpath('//a[@rel="next"]/@href').extract_first()
        if not relative_href:
            return

        # Resolve the relative href against the URL of the current page.
        absolute_url = urljoin(response.url, relative_href)
        print(absolute_url)
        yield response.follow(absolute_url, self.parse)
urljoin
- 引入 urllib.parse 模組下的 urljoin 函式
- 獲取當前頁的連結response.url
- 將當前頁的地址和下一頁的相對路徑地址拼接,從而獲取下一頁的連結