Crawler practice: recursively crawling every link under an entry page (distributed with scrapy-redis)
阿新 · Published 2019-02-19
1. Preparation before implementing scrapy-redis
- Install the scrapy and scrapy-redis modules in PyCharm
- Open the folder containing the scrapy-redis source code in PyCharm
- As with a plain scrapy project, modify four files: items, settings, pipelines, and the custom spider code dmoz
2. Differences between scrapy-redis and scrapy
scrapy-redis uses Redis to turn scrapy into a distributed crawler.
Scheduler
- scrapy
  - Rewrites Python's double-ended queue into its own priority queue, but when several spiders exist in one scrapy process they cannot share the same queue of pending requests
- scrapy-redis
  - Moves the scrapy request queue into a Redis database, so multiple crawlers read from and share a single queue (a simplified sketch of the idea follows this list)
  - FIFO and LIFO queues are also supported
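To make the shared-queue idea concrete, here is a deliberately simplified sketch (not the real scrapy_redis.scheduler implementation) of several worker processes feeding from a single Redis list. The key name dmoz:requests mirrors scrapy-redis's default '%(spider)s:requests' pattern, and the payload here is plain JSON rather than the serialized Request objects scrapy-redis actually stores.

import json
import redis

# One Redis server is shared by every crawler process
r = redis.StrictRedis(host='10.25.34.65', port=6379)

QUEUE_KEY = 'dmoz:requests'  # scrapy-redis default is '%(spider)s:requests'

def push_request(url, priority=0):
    # Any process may enqueue work; here a request is just a JSON blob
    r.lpush(QUEUE_KEY, json.dumps({'url': url, 'priority': priority}))

def pull_request(timeout=5):
    # Every worker blocks on the same key, so whichever process is idle
    # picks up the next request, which is what makes the crawl distributed
    result = r.brpop(QUEUE_KEY, timeout=timeout)
    return json.loads(result[1]) if result else None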
Duplication Filter
- scrapy
  - Uses an in-memory set for deduplication
  - The fingerprint of every request already sent is stored in the set, and each new request is checked against it to decide whether it has been requested before
- scrapy-redis
  - A Redis set never stores duplicate members
  - Fingerprints are stored in Redis, and only requests whose fingerprints are new are written to the request queue (see the sketch after this list)
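A rough sketch of the fingerprint mechanism, assuming a plain SHA1 over method + URL + body and a Redis SET; the real scrapy_redis.dupefilter.RFPDupeFilter builds its fingerprint with Scrapy's request-fingerprint utilities, so treat this only as an illustration.

import hashlib
import redis

r = redis.StrictRedis(host='10.25.34.65', port=6379)

DUPEFILTER_KEY = 'dmoz:dupefilter'  # scrapy-redis default is '%(spider)s:dupefilter'

def request_seen(method, url, body=b''):
    # Hash the parts of the request that identify it
    fp = hashlib.sha1()
    fp.update(method.encode())
    fp.update(url.encode())
    fp.update(body)
    fingerprint = fp.hexdigest()
    # SADD returns 1 if the member is new and 0 if it already existed,
    # so every crawler process shares one view of "already requested"
    return r.sadd(DUPEFILTER_KEY, fingerprint) == 0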
Item Pipeline
- scrapy
  - Scraped data is handed straight to the pipeline file
- scrapy-redis
  - Scraped items are pushed into a Redis queue, so item processing can run as a cluster of separate processes (a consumer sketch follows this list)
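As a hedged example of that "items processes cluster", the consumer below pops serialized items from the list that scrapy_redis.pipelines.RedisPipeline writes to (the default key pattern is '%(spider)s:items', so dmoz:items here) and could run on any machine that can reach the Redis server; the processing step is only a placeholder print.

import json
import redis

r = redis.StrictRedis(host='10.25.34.65', port=6379)

ITEMS_KEY = 'dmoz:items'  # RedisPipeline's default key pattern is '%(spider)s:items'

def consume_items():
    # Several copies of this loop can run on different machines;
    # BLPOP hands each serialized item to exactly one consumer
    while True:
        result = r.blpop(ITEMS_KEY, timeout=10)
        if result is None:
            continue
        item = json.loads(result[1])
        print(item.get('positionName'), item.get('salary'))  # placeholder processing

if __name__ == '__main__':
    consume_items()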
Base Spider
- scrapy
  - The plain Spider class
- scrapy-redis
  - Inherits from both the Spider and RedisMixin classes and reads URLs from Redis (see the RedisSpider sketch after this list)
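For reference, a minimal spider built on scrapy_redis.spiders.RedisSpider (which combines RedisMixin with Spider) could look like the sketch below: redis_key is the Redis list that start URLs are read from, and the parse body is only a placeholder. The spider later in this article keeps hard-coded start_urls instead, so this sketch is an assumption about how the Redis-driven variant would be wired, not the code used below.

from scrapy_redis.spiders import RedisSpider

class DistributedDmozSpider(RedisSpider):
    name = 'dmoz_redis'
    # Start URLs are popped from this Redis list instead of start_urls;
    # push one with: LPUSH dmoz_redis:start_urls <url>
    redis_key = 'dmoz_redis:start_urls'

    def parse(self, response):
        # Placeholder: yield whatever items or follow-up links the page produces
        yield {'url': response.url, 'title': response.css('title::text').get()}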
3. Code
settings
# Scrapy settings for lagou project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ['lagou.spiders']
NEWSPIDER_MODULE = 'lagou.spiders'
#USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'
# Deduplicate requests through Redis instead of Scrapy's in-memory filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler that keeps and dispatches the request queue in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the request queue and dupefilter in Redis when the spider closes (don't flush them)
SCHEDULER_PERSIST = True
# Queue implementation provided by scrapy-redis
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"  # commonly used: priority queue
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"  # FIFO queue, first in first out
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"  # LIFO queue, last in first out
ITEM_PIPELINES = {
'lagou.pipelines.lagouPipeline': 300,
'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Log level
# LOG_LEVEL = 'DEBUG'
# Introduce an artificial delay between requests so the target site is not
# overloaded when several crawler processes run in parallel
# Crawl delay (seconds)
DOWNLOAD_DELAY = 30
# Default request headers
DEFAULT_REQUEST_HEADERS = {
'Referer': 'https://www.lagou.com/jobs/list_%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput=',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
}
# Cookies are not needed for this crawl
COOKIES_ENABLED = False
# Do not obey robots.txt
ROBOTSTXT_OBEY = False
# Retry failed requests
RETRY_ENABLED = True
RETRY_TIMES = 5  # number of retries
DOWNLOAD_TIMEOUT = 5  # download timeout, in seconds
# Connect to the remote Redis server; pointing several machines (or a Redis
# cluster) at the same server is what makes the crawl distributed
REDIS_HOST = '10.25.34.65'
REDIS_PORT = 6379
items
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
class ExampleItem(Item):
    # Default fields from the scrapy-redis example project
name = Field()
description = Field()
link = Field()
crawled = Field()
spider = Field()
url = Field()
    # Custom fields for lagou job postings
positionName = Field()
companyFullName = Field()
companyShortName = Field()
companySize = Field()
financeStage = Field()
district = Field()
education = Field()
workYear = Field()
salary = Field()
positionAdvantage = Field()
class ExampleLoader(ItemLoader):
default_item_class = ExampleItem
default_input_processor = MapCompose(lambda s: s.strip())
default_output_processor = TakeFirst()
description_out = Join()
pipelines
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html
from datetime import datetime
import os
import pandas
class lagouPipeline(object):
    def process_item(self, item, spider):
        # Fields used by the framework example
        item["crawled"] = datetime.utcnow()
        item["spider"] = spider.name
        # Custom fields: read each value from its own item key
        positionName = item['positionName']
        companyFullName = item['companyFullName']
        companyShortName = item['companyShortName']
        companySize = item['companySize']
        financeStage = item['financeStage']
        district = item['district']
        education = item['education']
        workYear = item['workYear']
        salary = item['salary']
        positionAdvantage = item['positionAdvantage']
        # One row per item; wrap it in an outer list so pandas builds a 1x10 frame
        data = [[companyFullName, companyShortName, companySize, financeStage, district,
                 positionName, workYear, education, salary, positionAdvantage]]
        columns = ['公司全名', '公司簡稱', '公司規模', '融資階段', '區域', '職位名稱', '工作經驗', '學歷要求', '工資', '職位福利']
        df = pandas.DataFrame(data=data, columns=columns)
        # Append so earlier rows are kept; write the header only for the first row
        df.to_csv('北京-機器學習.csv', mode='a', header=not os.path.exists('北京-機器學習.csv'), index=None)
        return item
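Writing one DataFrame per item works after the fixes above, but an alternative worth noting (a sketch, not part of the original project; the class name BatchedCsvPipeline is made up and would need to be registered in ITEM_PIPELINES) is to buffer rows and write the CSV once in close_spider, so the file is not reopened for every scraped job.

import pandas

class BatchedCsvPipeline(object):
    # Hypothetical variant: buffer rows in memory and flush once when the spider closes
    def open_spider(self, spider):
        self.rows = []

    def process_item(self, item, spider):
        self.rows.append({
            '公司全名': item['companyFullName'],
            '公司簡稱': item['companyShortName'],
            '公司規模': item['companySize'],
            '融資階段': item['financeStage'],
            '區域': item['district'],
            '職位名稱': item['positionName'],
            '工作經驗': item['workYear'],
            '學歷要求': item['education'],
            '工資': item['salary'],
            '職位福利': item['positionAdvantage'],
        })
        return item

    def close_spider(self, spider):
        # One write at the end instead of one write per item
        pandas.DataFrame(self.rows).to_csv('北京-機器學習.csv', index=None)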
Custom spider code dmoz
import json
import math
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from lagou.items import ExampleItem
class DmozSpider(CrawlSpider):
name = 'dmoz'
allowed_domains = ['www.lagou.com']
start_urls=['https://www.lagou.com/jobs/positionAjax.json?px=default&city=北京&needAddtionalResult=false']
    # rules = [
    #     Rule(LinkExtractor(
    #         allow=(r'a regex matching every link inside the crawl domain www.lagou.com')
    #     ), callback='start_requests', follow=True),
    # ]
def start_requests(self):
print('start_requests--------------------------------------------------------')
url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=北京&needAddtionalResult=false'
yield scrapy.FormRequest(
url= url,
formdata={
'first': 'true',
'pn': '1',
'kd': '機器學習'
},
callback=self.get_pagenum,
)
def get_pagenum(self,response):
        # Work out how many result pages to request
meta = json.loads(response.body)
print(meta)
jobnum = meta['content']['positionResult']['totalCount']
pagedemo=math.ceil(jobnum / 15)
if pagedemo>30:
pagenum=30
else:
pagenum=pagedemo
        print(f'Total pages: {pagenum}')
url = response.url
for num in range(1,pagenum+1):
yield scrapy.FormRequest(
url= url,
formdata={
'first': 'true',
'pn': str(num),
'kd': '機器學習'
},
callback=self.get_message,
)
    def get_message(self, response):
        # json.loads turns the JSON response body into a dict; the job list
        # sits under content -> positionResult -> result
        meta = json.loads(response.body)
        print(f'meta:{meta}')
        joblist = meta['content']['positionResult']['result']
        for job in joblist:
            item = ExampleItem()
            item['positionName'] = job['positionName']
            item['companyFullName'] = job['companyFullName']
            item['companyShortName'] = job['companyShortName']
            item['companySize'] = job['companySize']
            item['financeStage'] = job['financeStage']
            item['district'] = job['district']
            item['education'] = job['education']
            item['workYear'] = job['workYear']
            item['salary'] = job['salary']
            item['positionAdvantage'] = job['positionAdvantage']
            # Hand each item to the item pipelines
            yield item
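Once the spider runs on one or more machines, all shared state lives in Redis. As a rough sanity check (an illustration, not part of the project code), the keys scrapy-redis creates for this spider can be inspected like this, assuming the default key names 'dmoz:requests', 'dmoz:dupefilter' and 'dmoz:items':

import redis

r = redis.StrictRedis(host='10.25.34.65', port=6379)

# Pending requests live in a sorted set when SpiderPriorityQueue is used
print('pending requests:', r.zcard('dmoz:requests'))
# Fingerprints of requests already seen by the shared dupefilter
print('seen fingerprints:', r.scard('dmoz:dupefilter'))
# Items pushed by scrapy_redis.pipelines.RedisPipeline
print('items waiting:', r.llen('dmoz:items'))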