Scraping Kuaidaili free proxies with Scrapy and storing them in MongoDB
阿新 · Published: 2020-01-31
Let's start by analyzing the pages.
The site's URL pattern is very simple:
https://www.kuaidaili.com/free/inha/1
https://www.kuaidaili.com/free/inha/2
Incrementing the trailing number pages through the listings, and the XPath rules needed to extract each row are straightforward.
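The rules are easy to verify interactively before writing the spider; a quick check in scrapy shell (the table class and data-title attributes below are the ones the spider relies on):

$ scrapy shell "https://www.kuaidaili.com/free/inha/1"
>>> rows = response.xpath("//table[@class='table table-bordered table-striped']/tbody/tr")
>>> len(rows)                                                 # number of proxy rows on the page
>>> rows[0].xpath(".//td[@data-title='IP']/text()").get()     # IP of the first row
>>> rows[0].xpath(".//td[@data-title='PORT']/text()").get()   # its port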
Now for the code.
items.py
import scrapy


class url(scrapy.Item):
    # fields to extract for each proxy row
    ip = scrapy.Field()      # proxy IP address
    port = scrapy.Field()    # proxy port
    name = scrapy.Field()    # location ("位置" column)
    time = scrapy.Field()    # response speed ("響應速度" column)
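A scrapy.Item behaves like a dict, which is exactly what the MongoDB pipeline below relies on when it calls dict(item). A quick illustration with made-up values:

>>> from tutorial.items import url
>>> item = url(ip='1.2.3.4', port='8080', name='北京', time='0.5秒')
>>> dict(item)  # a plain dict, ready for pymongo
{'ip': '1.2.3.4', 'port': '8080', 'name': '北京', 'time': '0.5秒'}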
Spider main logic
# -*- coding: utf-8 -*-
import scrapy

from tutorial.items import url


class IpSpider(scrapy.Spider):
    name = 'ip'
    # allowed_domains takes bare domains, not URLs with paths
    allowed_domains = ['www.kuaidaili.com']

    def start_requests(self):
        # pages 1 through 19 share the same URL pattern
        base = 'https://www.kuaidaili.com/free/inha/'
        for i in range(1, 20):
            page_url = base + str(i)
            print(page_url)
            # make_requests_from_url is deprecated; build Requests directly
            yield scrapy.Request(page_url, callback=self.parse)

    def parse(self, response):
        # each proxy is one <tr> of the listing table
        for line in response.xpath("//table[@class='table table-bordered table-striped']/tbody/tr"):
            item = url()  # a fresh item per row, so yielded items don't share state
            item['ip'] = line.xpath(".//td[@data-title='IP']/text()").get()
            item['port'] = line.xpath(".//td[@data-title='PORT']/text()").get()
            item['name'] = line.xpath(".//td[@data-title='位置']/text()").get()
            item['time'] = line.xpath(".//td[@data-title='響應速度']/text()").get()
            yield item
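Before wiring up MongoDB, the spider can be smoke-tested on its own with Scrapy's built-in feed export, which dumps the yielded items to a file (the output file name here is arbitrary):

$ scrapy crawl ip -o proxies.json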
MongoDB storage (pipelines.py)
import pymongo

import tutorial.settings


class TutorialPipeline(object):
    def __init__(self):
        host = tutorial.settings.MONGODB_HOST
        port = tutorial.settings.MONGODB_PORT
        dbname = tutorial.settings.MONGODB_DBNAME
        sheetname = tutorial.settings.MONGODB_SHEETNAME
        # connect to the MongoDB server
        client = pymongo.MongoClient(host=host, port=port)
        # select the database
        mydb = client[dbname]
        # collection that will hold the scraped proxies
        self.post = mydb[sheetname]

    def process_item(self, item, spider):
        data = dict(item)
        self.post.insert_one(data)  # insert() is deprecated in pymongo 3.x
        print("insert succeeded")
        return item
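Once a crawl has finished, the stored proxies can be read back with a few lines of pymongo. A minimal standalone sketch (the script name check.py is made up; it reuses the same settings module):

# check.py - peek at the stored proxies (hypothetical helper, not part of the project)
import pymongo

from tutorial import settings

client = pymongo.MongoClient(host=settings.MONGODB_HOST, port=settings.MONGODB_PORT)
collection = client[settings.MONGODB_DBNAME][settings.MONGODB_SHEETNAME]

print(collection.count_documents({}))   # how many proxies were stored
for doc in collection.find().limit(5):  # first five documents
    print(doc['ip'], doc['port'], doc['name'], doc['time'])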
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for tutorial project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tutorial'

SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'

# MongoDB host
MONGODB_HOST = "47.××.**.**"
# MongoDB port
MONGODB_PORT = 27017
# database name
MONGODB_DBNAME = "ip"
# collection that stores the data
MONGODB_SHEETNAME = "agent"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
#USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

# Obey robots.txt rules
# disabled here: this is a private crawler, so robots.txt is ignored
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 4  # lowered concurrency

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3  # download delay in seconds
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tutorial.middlewares.TutorialSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'tutorial.middlewares.TutorialDownloaderMiddleware': 100,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# enable the local HTTP cache
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 1
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
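With items, spider, pipeline, and settings in place, the whole crawl runs from the project root:

$ scrapy crawl ip

Note that with HTTPCACHE_EXPIRATION_SECS = 1 cached pages go stale after one second, so repeated runs still fetch fresh proxy lists instead of replaying the cache.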