python爬蟲學習:從資料庫讀取目標爬蟲站點及爬蟲規程,批量爬取目標站點制定資料(scrapy框架)
- 資料庫databaseConfig.py
`
from urllib.parse import quote_plus
from pymongo import MongoClient
import settings
class DB:
    """MongoDB connection helper configured from settings.py.

    Exposes:
      - self.collection: the pymongo Database object (collections are
        accessed as ``self.collection[sheet_name]``)
      - self.spider_result_sheet_name: collection storing scraped items
      - self.spider_station_sheet_name: collection storing target-site
        definitions (urls + xpath rules)
    """

    # NOTE(review): the original text showed "def init" — markdown ate the
    # dunder underscores; without __init__, DB() would never connect.
    def __init__(self):
        # Read the connection parameters from the settings module.
        host = settings.MONGODB_HOST
        port = settings.MONGODB_PORT
        dbname = settings.MONGODB_DBNAME
        user_name = settings.MONGODB_USERNAME
        password = settings.MONGODB_PASSWORD
        # Collection that stores the scraped results.
        self.spider_result_sheet_name = settings.MONGODB_SAVE_SPIDER_RESULT_SHEET_NAME
        # Collection that stores the target-site crawl rules.
        self.spider_station_sheet_name = settings.MONGODB_SPIDER_STATION_SHEET_NAME
        # Build the MongoDB URI; quote_plus escapes reserved characters
        # (':', '@', '/') that may appear in credentials or host.
        uri = "mongodb://%s:%s@%s:%s" % (
            quote_plus(user_name),
            quote_plus(password),
            quote_plus(host),
            quote_plus(port),
        )
        client = MongoClient(uri)
        # Select the database (despite the attribute name, this is a
        # pymongo Database, not a single collection).
        self.collection = client[dbname]
- 修改 scrapy 框架的 pipelines.py 檔案,新增爬蟲資料儲存到資料庫的方法
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import codecs
import json
import os
from MySpider.databaseConfig import DB
class MyScrapyPipeline:
    """Default no-op pipeline: passes every item through unchanged."""

    def process_item(self, item, spider):
        # Returning the item keeps it flowing to the next pipeline
        # configured in ITEM_PIPELINES.
        return item
# 以json檔案儲存
class JsonPipeline(object):
    """Append each item to a JSON-lines file (one JSON object per line)."""

    def process_item(self, item, spider):
        # base_dir = os.getcwd()
        # filename = base_dir + '/spiderData.json'
        filename = 'D:/development/datas' + '/spiderData.json'
        # ensure_ascii=False keeps non-ASCII text readable in the file
        # instead of emitting \uXXXX escapes (e.g. "/xe15"-style output).
        record = json.dumps(dict(item), ensure_ascii=False)
        # Open in append mode so successive items accumulate.
        with codecs.open(filename, 'a', encoding='utf-8') as out:
            out.write(record + '\n')
        return item
# 儲存到mongodb資料庫
class SpiderMongoPipeline(object):
    """Persist scraped items to the MongoDB collection configured in settings."""

    def open_spider(self, spider):
        # Create the database connection once per spider run; the original
        # built a fresh DB() (and thus a new MongoClient) for every item.
        self._db = DB()

    def process_item(self, item, spider):
        data = dict(item)
        # Collection.insert() is deprecated and was removed in PyMongo 4.x;
        # insert_one() is the supported single-document insert.
        self._db.collection[self._db.spider_result_sheet_name].insert_one(data)
        return item
`
- 編輯items.py 對應資料庫欄位
`
# Define here the models for your scraped items
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
class MyDataItem(Item):
    """Scrapy item whose fields mirror the record stored in MongoDB."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = Field()         # article title scraped from the detail page
    author = Field()        # article author (spider falls back to the site name)
    release_time = Field()  # publication time scraped from the page
    url = Field()           # detail-page url the item was scraped from
    create_time = Field()   # timestamp when the spider created the item
    # pass
`
- 核心爬蟲方法mySpider.py
`
# coding=utf-8
import time
import scrapy
from scrapy.selector import Selector
from mySpider.databaseConfig import DB
from mySpider.items import MyDataItem
class MySpider(scrapy.Spider):
    """Reads target-site definitions from MongoDB and crawls each site.

    Every document in the station collection carries the site url plus the
    xpath expressions used to locate the article list, the detail links,
    and the title / author / release-time fields on the detail page.
    """

    # Unique spider id (must not clash with other spiders); this is the
    # name used on the command line: "scrapy crawl mySpider".
    name = 'mySpider'

    def start_requests(self):
        """Override Scrapy's entry point: yield one request per configured site."""
        # Build the DB connection once; the original called DB() twice,
        # opening two MongoClient connections for a single lookup.
        db = DB()
        collection = db.collection[db.spider_station_sheet_name]
        # Each stored document describes one target site to crawl.
        for station in collection.find():
            yield scrapy.Request(
                url=station["station_url"],   # target site url
                meta=station,                 # pass the whole crawl config along
                callback=self.parse_station,
            )

    def parse_station(self, response):
        """Parse the article-list page and request every article detail page."""
        meta = response.meta  # site config handed over by start_requests
        articles = Selector(response).xpath(meta["table_xpath"])  # article list nodes
        for article in articles:
            detail_url = meta["station_root_url"] + article.xpath(meta["article_detail_xpath"]).extract()[0]
            # dont_filter=True: without it the dupe filter can drop requests
            # and parse_detail would effectively run only once.
            yield scrapy.Request(url=detail_url, meta=meta,
                                 callback=self.parse_detail, dont_filter=True)

    def parse_detail(self, response):
        """Extract one article and emit a MyDataItem for the pipelines."""
        items = MyDataItem()
        current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        meta = response.meta
        selector = Selector(response)
        items['title'] = selector.xpath(meta["title_xpath"]).extract()[0]
        # When no author xpath is configured, fall back to the site name.
        items['author'] = meta["station_name"] if meta["author_xpath"] == "" else selector.xpath(meta["author_xpath"]).extract()[0]
        items['release_time'] = selector.xpath(meta["release_time_xpath"]).extract()[0]
        items['url'] = response.url
        items['create_time'] = current_time
        yield items  # hand the item over to the pipelines (pipelines.py)
`
- settings.py 修改(以下為settings.py的部分配置內容)
`
BOT_NAME = 'mySpider'

# Package (from the project root) that holds the spider modules.
SPIDER_MODULES = ['myScrapy.spiders']
NEWSPIDER_MODULE = 'myScrapy.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
LOG_LEVEL = 'ERROR'

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # Markdown mangling dropped the asterisks in the original article;
    # the standard wildcard media range is */*;q=0.8.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

ITEM_PIPELINES = {
    # Lower numbers run earlier (valid range 0-1000); only the MongoDB
    # pipeline is enabled here.
    'myScrapy.pipelines.SpiderMongoPipeline': 200,
}

# 以下mongodb資料庫配置資訊省略 (MongoDB connection settings omitted)
`
- 啟動類main.py
`
# The four launch methods below are mutually exclusive alternatives.
# cmdline.execute() hands control to Scrapy and does not fall through,
# so only one of them may be active at a time; the stripped '#' markers
# in the original article made all of them run "sequentially".

# Method 1: run a single spider module, storing via the pipelines.py config
from scrapy import cmdline
# cmdline.execute("scrapy crawl recruit".split())

# Method 2: run a single spider module, saving to a file in the project root
# cmdline.execute("scrapy crawl recruit -o rsj.json".split())

# Method 3: run a chosen batch of spider modules
# cmdline.execute("scrapy crawlProcess rsj cqgsdx".split())
# cmdline.execute(['scrapy', 'crawl', 'recruit'])

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Method 4 (active): run every spider found under the spiders package
process = CrawlerProcess(get_project_settings())
didntWorkSpider = ['rsj', 'cqgsdx']  # spider modules to skip
process_spider_list = process.spiders.list()  # all spider names under spiders/
for the_spider_name in process_spider_list:
    if the_spider_name in didntWorkSpider:
        continue
    print("Running spider %s" % (the_spider_name))
    process.crawl(the_spider_name)
# Start all scheduled crawls; blocks until they finish.
process.start()
`