# Scrapy spider/downloader middleware configuration
# (blog header — published 2018-12-11)
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from newrecord.settings import NOW_Y, NOW_M, NOW_D, YES_Y, YES_M, YES_D
from scrapy import signals
import time
import base64 # DownloadMiddleware # 0 47 167 寶藍色RGB
# Adding a proxy inside a downloader middleware's process_request:
#   proxy_user_pass = 'USERNAME:PASSWORD'
#   encoded_user_pass = base64.b64encode(proxy_user_pass)
#   request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
#   request.meta['proxy'] = 'IP:PORT'
class NewrecordSpiderMiddleware(object):
    """Spider middleware: observes responses entering the spider and the
    requests/items it emits.

    Not all methods need to be defined; if a method is missing, Scrapy acts
    as if the spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Used by Scrapy to create the middleware instance; hooks the
        # spider_opened signal for logging.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response going through the spider middleware and
        into the spider (after downloader middleware's process_response).

        Must return None or raise an exception; returning None lets the
        response continue into the spider unchanged.
        """
        print('-----------------------3--------------------')
        # Fix: interpolate the URL into the message (original wrapped a bare
        # ``return None`` in a pointless try/except, removed here).
        print('---進入spidermiddleware----process_spider_input------response.url----%s--------' % (response.url))
        return None

    def process_spider_output(self, response, result, spider):
        """Called with the results returned from the spider after it has
        processed the response.

        Must return an iterable of Request, dict or Item objects; this
        implementation passes every result through unchanged.
        """
        for item in result:
            yield item

    def process_spider_exception(self, response, exception, spider):
        # Called when the spider or another middleware's
        # process_spider_input() raises. Should return None or an iterable
        # of Response, dict or Item objects; None continues default handling.
        pass

    def process_start_requests(self, start_urls, spider):
        """Called with the spider's start requests; works like
        process_spider_output() except there is no associated response.

        Must yield only requests (not items). Every request is passed
        through; 'rank_news' URLs are additionally logged. (The yield is at
        loop level on purpose — dropping non-matching start requests would
        break the crawl.)
        """
        for req in start_urls:
            if str(req).find('rank_news') >= 0:
                print('---------------------0-----------------------------')
                # Fix: the original passed ``req`` as a second print argument
                # instead of interpolating it into the format string.
                print('-------------------進入Spider MiddleWare裡面的開始爬去網頁url-----------start_requests===:%s' % req)
            yield req

    def spider_opened(self, spider):
        # Signal handler connected in from_crawler.
        spider.logger.info('Spider opened: %s ' % spider.name)
class NewrecordDownloaderMiddleware(object):
    """Downloader middleware: sees every request before it is downloaded and
    every response before it is handed to the spider middleware.

    Not all methods need to be defined; if a method is missing, Scrapy acts
    as if the downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Fix: the original referenced ``s`` without ever creating it,
        # raising NameError whenever Scrapy instantiated this middleware.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Proxy-authentication notes kept from the original:
    #   encoded_user_pass = base64.b64encode(proxy_user_pass)
    #   request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
    #   request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT"
    # request.meta / request.cookies can also carry per-request data:
    #   request.meta['item'] = item  (pass data to the callback)
    #   request.cookies['name'] = value

    def process_request(self, request, spider):
        """Called for every outgoing request.

        Return None to continue processing, a Response/Request to
        short-circuit, or raise IgnoreRequest.
        """
        print('---------------1------------------')
        print('----------------進入DownloadMiddleWare中的request的url是:%s----------------' % (request.url))
        return None

    def process_response(self, request, response, spider):
        """Called for every downloaded response.

        Must return a Response (which then enters the spider middleware's
        process_spider_input), a Request, or raise IgnoreRequest. URL-based
        filtering is usually better done via the spider's Rules.
        """
        print('-----------------------------2---------------------------------')
        print('----------------進入DownloadMiddleWare中的response的url是:%s----------------' % (response.url))
        return response

    def process_exception(self, request, exception, spider):
        # Return None to continue processing this exception; returning a
        # Response or Request object stops the process_exception() chain.
        pass

    def spider_opened(self, spider):
        # Signal handler connected in from_crawler.
        spider.logger.info('Spider opened: %s' % spider.name)
# (stray editor status-line artifact from the original paste)