scrapy pipeline 將資料存入不同的資料庫
阿新 • • 發佈:2018-12-11
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from datetime import datetime
import json

import pymongo
import redis
from scrapy.exporters import CsvItemExporter


class ToutiaoPipeline(object):
    """Stamp every item with its originating spider name and a UTC timestamp."""

    def process_item(self, item, spider):
        item['source'] = spider.name
        item['utc_time'] = str(datetime.utcnow())
        return item


class ToutiaoJsonPipeline(object):
    """Write each item as one JSON line (comma-terminated) into data.json."""

    def open_spider(self, spider):
        # Explicit utf-8 so non-ASCII (e.g. Chinese) text round-trips
        # regardless of the platform's default encoding.
        self.filename = open("data.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps non-ASCII characters readable instead of
        # emitting \uXXXX escape sequences.
        content = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(content)
        return item

    def close_spider(self, spider):
        self.filename.close()


class ToutiaoCsvPipeline(object):
    """Export items into data.csv via Scrapy's CsvItemExporter."""

    def open_spider(self, spider):
        # CsvItemExporter requires a binary-mode file handle.
        self.filename = open("data.csv", "wb")
        # Create a CSV exporter bound to the file that will receive the data.
        self.csv_exporter = CsvItemExporter(self.filename)
        # Signal the start of the export session.
        self.csv_exporter.start_exporting()

    def process_item(self, item, spider):
        self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Signal the end of the export session, then release the file.
        self.csv_exporter.finish_exporting()
        self.filename.close()


class ToutiaoMongoPipeline(object):
    """Insert each item into the toutiao.content_data MongoDB collection."""

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host="192.168.xx.xx", port=27017)
        self.db = self.client['toutiao']
        self.collection = self.db['content_data']

    def process_item(self, item, spider):
        # insert_one replaces Collection.insert, which was deprecated in
        # pymongo 3 and removed in pymongo 4.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Release the connection pool when the spider finishes.
        self.client.close()


class ToutiaoRedisPipeline(object):
    """Push each item, serialized as JSON, onto the TOUTIAO_ITEM Redis list."""

    def open_spider(self, spider):
        self.client = redis.Redis(host="127.0.0.1", port=6379)

    def process_item(self, item, spider):
        # Keep non-ASCII text readable in the stored payload (consistent with
        # the JSON file pipeline above).
        content = json.dumps(dict(item), ensure_ascii=False)
        self.client.lpush("TOUTIAO_ITEM", content)
        return item