部署Scrapy專案
阿新 • • 發佈:2019-01-04
scrapyd操作筆記
爬蟲執行緒
pip install scrapyd
安裝依賴(自動生成egg檔案)
pip install scrapyd-client
pip install apscheduler
pip install requests
檢視所有爬蟲
curl http://localhost:6800/listspiders.json?project=VehicleOrderScrapy
檢視爬蟲狀態
curl http://localhost:6800/listjobs.json?project=VehicleOrderScrapy
開啟爬蟲
格式舉例:
curl http://localhost:6800/schedule.json -d project=myproject -d spider=somespider -d setting=DOWNLOAD_DELAY=2 -d arg1=val1
curl http://localhost:6800/schedule.json -d project=VehicleOrderScrapy -d spider=vehicle_order_86huoche -d latestUpdateDate=2018-01-23
每次更新後需要重新部署
scrapyd-deploy vehicle_order -p VehicleOrderScrapy
我現在用cron來定期執行爬蟲,貼一段半成品程式碼
#coding:utf-8
"""Periodically schedule the VehicleOrderScrapy spiders through scrapyd.

A background APScheduler job posts one ``schedule.json`` request per spider
to the local scrapyd instance, then records the time of the run in a MySQL
settings table so the next run knows where to resume from.
"""
import os
import time
import requests
from project_config import *
from apscheduler.schedulers.background import BackgroundScheduler
import pymysql
from datetime import datetime

LATEST_UPDATE_DATE = None
CONNECT = None

# scrapyd endpoint used to start a spider run.
SCRAPYD_SCHEDULE_URL = 'http://localhost:6800/schedule.json'
# Seconds to wait for scrapyd before giving up, so a stalled daemon cannot
# block the scheduler thread forever.
SCRAPYD_TIMEOUT = 30


def task():
    """Schedule every spider on scrapyd, then store the current time.

    The stored value is read back on the next run and handed to each spider
    as its ``latestUpdateDate`` argument.
    """
    latest_update_date = getLatestUpdateDate()
    # format() instead of '+' so a missing settings row (None) cannot raise
    # TypeError here.
    print('LATEST_UPDATE_DATE:{}'.format(latest_update_date))
    spider_list = [
        'vehicle_order_58',
        'vehicle_order_ganji',
        'vehicle_order_baixing',
        'vehicle_order_86huoche',
    ]
    # Equivalent request:
    # http://localhost:6800/schedule.json?project=VehicleOrderScrapy&spider=vehicle_order_86huoche&latestUpdateDate=2018-01-23
    for name in spider_list:
        data = {
            'project': 'VehicleOrderScrapy',
            'spider': name,
            'latestUpdateDate': formatLatestUpdateDate(name, latest_update_date),
        }
        print('spider-->%s' % data)
        requests.post(SCRAPYD_SCHEDULE_URL, data=data, timeout=SCRAPYD_TIMEOUT)
    updateLatestUpdateDate(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))


def formatLatestUpdateDate(spider_name, latestUpdateDate):
    """Return ``latestUpdateDate`` in the format expected by ``spider_name``.

    TODO: placeholder, not implemented yet. It currently returns None for
    every spider, exactly as the original stub did.
    """
    return None


def connectDB():
    """Open the MySQL connection and store it in the module-level CONNECT.

    Connection settings are read from ``db_config`` (provided by
    ``project_config``). Any pymysql error propagates to the caller.
    """
    global CONNECT
    dbparams = dict(
        host=db_config['MYSQL_HOST'],
        db=db_config['MYSQL_DBNAME'],
        user=db_config['MYSQL_USER'],
        passwd=db_config['MYSQL_PASSWD'],
        # charset must be set, otherwise Chinese text may come back garbled.
        charset='utf8',
    )
    CONNECT = pymysql.connect(**dbparams)


def closeDB():
    """Close the shared connection if one was opened."""
    global CONNECT
    if CONNECT is not None:
        CONNECT.close()
        CONNECT = None


def getLatestUpdateDate():
    """Return the stored ``latestUpdateDate`` value, or None if no row exists."""
    # Check the connection and reconnect automatically if it was lost.
    CONNECT.ping(True)
    # The table name cannot be a bound parameter, so it is formatted in from
    # configuration; the row name is passed to the driver as a parameter.
    sql = "SELECT value FROM {} WHERE name = %s".format(
        db_config['MYSQL_SETTINGS_TABLENAME'])
    with CONNECT.cursor() as cursor:
        cursor.execute(sql, ('latestUpdateDate',))
        result = cursor.fetchone()
    return result[0] if result else None


def updateLatestUpdateDate(latestUpdateDate):
    """Store ``latestUpdateDate`` in the settings table.

    The change is committed on success and rolled back (with the exception
    re-raised) on failure.
    """
    # Check the connection and reconnect automatically if it was lost.
    CONNECT.ping(True)
    sql = "UPDATE {} SET value = %s WHERE name = %s".format(
        db_config['MYSQL_SETTINGS_TABLENAME'])
    params = (latestUpdateDate, 'latestUpdateDate')
    print('{} {!r}'.format(sql, params))
    try:
        with CONNECT.cursor() as cursor:
            cursor.execute(sql, params)
    except Exception:
        CONNECT.rollback()
        raise
    else:
        CONNECT.commit()


if __name__ == "__main__":
    scheduler = BackgroundScheduler()
    connectDB()
    # Cron trigger: fires on Tuesday, Thursday and Sunday during hour 23.
    scheduler.add_job(task, 'cron', day_of_week='tue,thu,sun', hour='23')
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
    try:
        # The scheduler runs in a background thread; keep the main thread alive.
        while True:
            time.sleep(2)
    except (KeyboardInterrupt, SystemExit):
        # Stop the scheduler first so no job is still using the connection
        # when it gets closed.
        scheduler.shutdown()
        closeDB()