1. 程式人生 > >部署Scrapy專案

部署Scrapy專案

scrapyd操作筆記

爬蟲執行緒

pip install scrapyd

安裝依賴(自動生成egg檔案)

pip install scrapyd-client
pip install apscheduler
pip install requests

檢視所有爬蟲

curl http://localhost:6800/listspiders.json?project=VehicleOrderScrapy

檢視爬蟲狀態

curl http://localhost:6800/listjobs.json?project=VehicleOrderScrapy

開啟爬蟲

格式舉例:
curl

http://localhost:6800/schedule.json -d project=myproject -d spider=somespider -d setting=DOWNLOAD_DELAY=2 -d arg1=val1

curl http://localhost:6800/schedule.json -d project=VehicleOrderScrapy -d spider=vehicle_order_86huoche -d latestUpdateDate=2018-01-23

每次更新後需要重新部署

scrapyd-deploy vehicle_order -p VehicleOrderScrapy

我現在用cron來定期執行爬蟲,貼一段半成品程式碼

#coding:utf-8
import os
import time
import requests
from project_config import *
from apscheduler.schedulers.background import BackgroundScheduler
import pymysql
from datetime import datetime

# Checkpoint date loaded from the settings table (written inside task()).
LATEST_UPDATE_DATE = None
# Shared pymysql connection, initialised once by connectDB().
CONNECT = None

def task():
    """Read the stored checkpoint date and schedule every spider via scrapyd.

    Posts one ``schedule.json`` request per spider with the formatted
    checkpoint as ``latestUpdateDate``, then advances the checkpoint in the
    database to "now" so the next run only crawls newer records.
    """
    latest_update_date = getLatestUpdateDate()
    # %s formatting tolerates None (first run / missing settings row); the
    # original string concatenation raised TypeError in that case.
    print('LATEST_UPDATE_DATE:%s' % latest_update_date)

    spider_list = ['vehicle_order_58', 'vehicle_order_ganji',
                   'vehicle_order_baixing', 'vehicle_order_86huoche']
    # e.g. http://localhost:6800/schedule.json?project=VehicleOrderScrapy&spider=vehicle_order_86huoche&latestUpdateDate=2018-01-23
    for name in spider_list:
        data = {
            'project': 'VehicleOrderScrapy',
            'spider': name,
            'latestUpdateDate': formatLatestUpdateDate(name, latest_update_date),
        }
        print('spider-->%s' % data)
        requests.post('http://localhost:6800/schedule.json', data=data)
    # Record "now" as the new checkpoint for the next scheduled run.
    updateLatestUpdateDate(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

def formatLatestUpdateDate(spider_name, latestUpdateDate):
    """Format the checkpoint date for one specific spider.

    TODO: unfinished stub — it currently ignores both arguments and returns
    None, so every spider is scheduled with ``latestUpdateDate=None``.
    """
    return None
def connectDB():
    """Open the shared MySQL connection from ``project_config`` settings.

    Stores the connection in the module-level ``CONNECT``; call once at
    startup before any of the query helpers run. Connection errors
    propagate to the caller.
    """
    global CONNECT

    dbparams = dict(
        host=db_config['MYSQL_HOST'],
        db=db_config['MYSQL_DBNAME'],
        user=db_config['MYSQL_USER'],
        passwd=db_config['MYSQL_PASSWD'],
        charset='utf8',  # required, otherwise Chinese text may be garbled
    )
    # The original try/except only re-raised; letting the exception
    # propagate directly is equivalent and simpler.
    CONNECT = pymysql.connect(**dbparams)

def closeDB():
    """Close the module-level database connection opened by connectDB().

    ``global`` is unnecessary here: calling a method on CONNECT only reads
    the module-level name, it does not rebind it.
    """
    CONNECT.close()

def getLatestUpdateDate():
    """Return the stored 'latestUpdateDate' checkpoint value, or None.

    Reads the settings table named in ``db_config``; reconnects
    automatically if the server dropped the connection.
    """
    global CONNECT

    # Reconnect automatically if the connection was lost.
    CONNECT.ping(True)

    # Table names cannot be bound as parameters, but the value can — use a
    # placeholder instead of interpolating it into the SQL text.
    sql = "SELECT value FROM %s WHERE name=%%s" % db_config['MYSQL_SETTINGS_TABLENAME']
    cursor = CONNECT.cursor()
    try:
        cursor.execute(sql, ('latestUpdateDate',))
        row = cursor.fetchone()
        # fetchone() returns None when the settings row is missing.
        return row[0] if row else None
    finally:
        # Original leaked the cursor; always release it.
        cursor.close()

def updateLatestUpdateDate(latestUpdateDate):
    """Persist *latestUpdateDate* as the new crawl checkpoint.

    Commits on success; rolls back and re-raises on any failure.
    """
    global CONNECT
    # Reconnect automatically if the connection was lost.
    CONNECT.ping(True)

    cursor = CONNECT.cursor()
    # The table name comes from config and must be interpolated, but the
    # values are bound as parameters — the original interpolated them into
    # the SQL string, an injection/quoting hazard.
    sql = "UPDATE %s SET value = %%s WHERE name=%%s" % db_config['MYSQL_SETTINGS_TABLENAME']
    try:
        cursor.execute(sql, (latestUpdateDate, 'latestUpdateDate'))
    except Exception:
        CONNECT.rollback()
        raise
    else:
        CONNECT.commit()
    finally:
        # Original leaked the cursor; always release it.
        cursor.close()

if __name__ == "__main__":
    scheduler = BackgroundScheduler()
    # NOTE(review): the original comment claimed "run every 20 minutes", but
    # the cron trigger below actually fires at 23:00 on Tue/Thu/Sun.
    connectDB()
    scheduler.add_job(task, 'cron', day_of_week='tue,thu,sun', hour='23')
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
    try:
        # Keep the main thread alive; BackgroundScheduler runs in a daemon thread.
        while True:
            time.sleep(2)
    except (KeyboardInterrupt, SystemExit):
        closeDB()
        scheduler.shutdown()

參考資料

scrapyd和scrapyd-client使用教程