python Scrapy 爬蟲例項
阿新 • • 發佈:2018-12-05
https://www.jianshu.com/p/78f0bc64feb8
1.新建專案
scrapy startproject cnblog
2.pycharm 開啟專案
image.png
3.新建spider
image.png
新建main.py
from scrapy import cmdline

# Launch the "cnblog" spider from inside the IDE, exactly as if
# `scrapy crawl cnblog` had been typed on the command line.
cmdline.execute(['scrapy', 'crawl', 'cnblog'])
爬蟲程式碼
import scrapy

from cnblog.items import CnblogItem


class Cnblog_Spider(scrapy.Spider):
    """Spider that scrapes post titles and links from the cnblogs.com front page."""

    name = "cnblog"
    # FIX: was ["cnblog.com"], which does not match the site actually crawled
    # (start_urls points at cnblogs.com); the offsite middleware would drop
    # any followed request under the mismatched domain.
    allowed_domains = ["cnblogs.com"]
    start_urls = [
        'https://www.cnblogs.com/',
    ]

    def parse(self, response):
        """Extract all post headline texts and their hrefs into one CnblogItem.

        title and link are parallel lists: the i-th title corresponds to
        the i-th link.
        """
        item = CnblogItem()
        item['title'] = response.xpath('//a[@class="titlelnk"]/text()').extract()
        item['link'] = response.xpath('//a[@class="titlelnk"]/@href').extract()
        yield item
item程式碼
import scrapy


class CnblogItem(scrapy.Item):
    """Container for a single scraped front-page listing from cnblogs."""

    # Post headline text(s) extracted from the page.
    title = scrapy.Field()
    # Corresponding post URL(s).
    link = scrapy.Field()
settings.py 設定檔
# Scrapy project settings for the cnblog crawler.
BOT_NAME = 'cnblog'

SPIDER_MODULES = ['cnblog.spiders']
NEWSPIDER_MODULE = 'cnblog.spiders'

# Headers sent with every request; the User-Agent masquerades as a desktop
# Chrome browser so the site does not serve a bot/blocked page.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # user-agent newly added
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
}

# Newly modified.
# FIX: both pipelines were registered at priority 300, which leaves their
# execution order unspecified; pipelines run in ascending priority order,
# so give each a distinct value (file first, then MySQL).
ITEM_PIPELINES = {
    'cnblog.pipelines.FilePipeline': 300,   # save to a txt file
    'cnblog.pipelines.mysqlPipeline': 301,  # save to MySQL
}
4.儲存成 txt 檔案
class FilePipeline(object):
    """Item pipeline that saves each item's (title, link) pairs to cnblog.txt."""

    def process_item(self, item, spider):
        """Append one 'title:link' line per scraped post to cnblog.txt.

        FIX: the original opened the file in 'w' mode on every item, so each
        new item overwrote everything saved before it; append mode preserves
        earlier items. Also removed the redundant f.close() inside the `with`
        block and replaced the quadratic string `+=` with writelines.
        Returns the item unchanged so later pipelines still receive it.
        """
        lines = [title + ':' + link + '\n'
                 for title, link in zip(item['title'], item['link'])]
        with open('cnblog.txt', 'a', encoding='utf-8') as f:
            f.writelines(lines)
        return item