python scrapy 基本操作演示程式碼
阿新 • • 發佈:2018-11-08
# -*- coding: utf-8 -*-
"""Basic Scrapy demo: crawl quotes.toscrape.com and follow its pagination."""
import scrapy

from quotetutorial.items import QuotetutorialItem


class QuotesSpider(scrapy.Spider):
    """Spider that collects the text, author and tags of every quote.

    Most of the project-specific crawling logic lives in this class.
    """

    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Extract the quotes on one listing page, then follow the next page.

        Args:
            response: The downloaded listing page handed in by Scrapy.

        Yields:
            One ``QuotetutorialItem`` per quote found on the page, followed by
            a ``scrapy.Request`` for the next listing page when one exists.
        """
        for quote in response.css('.col-md-8 .quote'):
            item = QuotetutorialItem()
            # extract_first() returns a single value (like "find one") ...
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            # ... while extract() returns every match (like "find all").
            item['tags'] = quote.css('.tags .tag::text').extract()
            yield item

        # The pager's "next" link holds a relative href; it is absent on the
        # last page, in which case the crawl simply stops here.
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page is not None:
            # urljoin() resolves the relative href against the current URL.
            # Pass the bound method itself as the callback -- do not call it.
            yield scrapy.Request(
                url=response.urljoin(next_page),
                callback=self.parse,
            )


# Saving the scraped items to a file (run from the project directory):
#   scrapy crawl quotes -o quotes.json
#   scrapy crawl quotes -o quotes.jl
#   scrapy crawl quotes -o quotes.csv
#   scrapy crawl quotes -o quotes.xml
#   scrapy crawl quotes -o ftp://user:pass@ftp.example.com/path/quotes.csv