A crawler proxy pool: scrape millions of records with ease.
阿新 • Published: 2020-08-27
1. Today we will cover something very useful: a proxy IP pool. The end result is a task that, at a fixed interval, crawls a target proxy-IP provider site, stores the usable proxies in a MySQL database, and rechecks the proxies already in the database, deleting any that are no longer usable.
2. Writing the spider that extracts proxy IPs into the database
2.1 Prepare the MySQL table
CREATE TABLE `t_ips` (
  `id` int(10) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `ip` varchar(15) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'ip',
  `port` int(10) NOT NULL COMMENT 'port',
  `type` int(10) NOT NULL DEFAULT '0' COMMENT '0:http 1:https',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=421 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci COMMENT='ip table';
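Once this table is being populated, any other scraper can pull proxies out of it. A minimal sketch of how a consumer might grab a random usable HTTP proxy with pymysql; the connection parameters are placeholders and get_random_proxy is just an illustrative helper, not part of the project below:

import pymysql

def get_random_proxy():
    # placeholder credentials; point these at the same database the pipeline writes to
    db = pymysql.connect(host='127.0.0.1', user='root',
                         password='your-password', database='xici', charset='utf8')
    try:
        cursor = db.cursor()
        # type = 0 means an HTTP proxy, per the column comment above
        cursor.execute("SELECT ip, port FROM t_ips WHERE type = 0 ORDER BY RAND() LIMIT 1")
        row = cursor.fetchone()
        if row is None:
            return None
        server = "%s:%s" % (row[0], row[1])
        return {'http': 'http://' + server, 'https': 'https://' + server}
    finally:
        db.close()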
2.2 Create the Scrapy project and write items.py (the fields correspond to the database columns)
import scrapy

class IpsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    ip = scrapy.Field()
    port = scrapy.Field()
    httpType = scrapy.Field()
2.3 Write settings.py
# -*- coding: utf-8 -*-

#################### custom settings ################
MAX_PAGE = 2    ## number of pages of the proxy site to crawl
# 0: http  1: https
TYPE = 0        ### proxy ip type
URL = 'http://www.bugng.com/gnpt?page='    ### proxy list url
TIMER_STOP_TIME = 20    ### pause between timer runs (seconds)
#####################################################

BOT_NAME = 'ips'
SPIDER_MODULES = ['ips.spiders']
NEWSPIDER_MODULE = 'ips.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

ITEM_PIPELINES = {
    'ips.pipelines.IpsPipeline': 300,
}

# disable retries
RETRY_ENABLED = False

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'csdn (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# reduce the download timeout
DOWNLOAD_TIMEOUT = 2

# disable cookies
COOKIES_ENABLED = False

# delay downloads to avoid getting banned
DOWNLOAD_DELAY = 2
2.4 Write the spider
bs4 (BeautifulSoup) is used here and needs to be installed separately.

# -*- coding: utf-8 -*-
import scrapy
import logging
from bs4 import BeautifulSoup
from ips.items import IpsItem
from ips.settings import *

class XicispiderSpider(scrapy.Spider):
    name = 'xiciSpider'
    # keep this in line with the site configured in settings.py
    allowed_domains = ['bugng.com']
    start_urls = ['http://www.bugng.com/']

    ### queue up the start urls
    def start_requests(self):
        req = []
        for i in range(MAX_PAGE):
            ### url of the i-th page of the proxy list
            req.append(scrapy.Request(URL + str(i)))
        return req

    ## callback that parses each page with bs4
    def parse(self, response):
        print('@@@@@@@@@ start parsing ' + response.url)
        try:
            soup = BeautifulSoup(str(response.body, encoding="utf-8"), 'html.parser')
            trs = soup.find('table', {'class': 'table'}).find_all('tr')
            for tr in trs[1:]:
                tds = tr.find_all('td')
                cur = 0
                item = IpsItem()
                item['httpType'] = TYPE
                for td in tds:
                    if cur == 0:
                        item['ip'] = td.text
                    if cur == 1:
                        item['port'] = td.text
                    cur = cur + 1
                yield item    #### handed off to the pipeline
        except Exception as e:
            logging.log(logging.WARNING, '@@@@@@@@@ start parser ' + str(e))
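Before wiring up the pipeline, the parsing logic can be sanity-checked interactively. A quick sketch, assuming the target page is still reachable: run scrapy shell "http://www.bugng.com/gnpt?page=0" and then, inside the shell:

from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
trs = soup.find('table', {'class': 'table'}).find_all('tr')
print(len(trs))                          # number of table rows found
print(trs[1].find_all('td')[0].text)     # ip of the first proxy row

If the site changes its markup, this is where the parse failure will show up first.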
2.5 Write the pipeline
This requires the MySQL driver: pip install mysqlclient
Two checks are performed before inserting into the database:
1. Does the record already exist?
2. Is the proxy actually usable?
# -*- coding: utf-8 -*-
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
import logging
import requests

class IpsPipeline(object):
    def __init__(self):
        dbargs = dict(
            host='your database host',
            db='your database name',
            user='root',
            passwd='your database password',
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        self.dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)

    ## handle every item yielded by the spider
    def process_item(self, item, spider):
        res = self.dbpool.runInteraction(self.insert_into_table, item)
        return item

    def insert_into_table(self, conn, item):
        # skip the insert if the record already exists
        if self.exsist(item, conn):
            return
        # only insert proxies that are actually usable
        if self.proxyIpCheck(item['ip'], item['port']) is False:
            print("proxy unusable, proxy:", item['ip'], ':', str(item['port']))
            return
        sql = 'insert into t_ips (ip,port,type) VALUES ('
        sql = sql + '"' + item['ip'] + '",'
        sql = sql + str(item['port']) + ','
        sql = sql + str(item['httpType']) + ','
        sql = sql[0:-1]
        sql = sql + ')'
        try:
            conn.execute(sql)
            print(sql)
        except Exception as e:
            logging.log(logging.WARNING, "sql error >> " + sql)

    def exsist(self, item, conn):
        sql = 'select * from t_ips where ip="' + item['ip'] + '" and port=' + str(item['port'])
        try:
            # run the query
            conn.execute(sql)
            # fetch all matching rows
            results = conn.fetchall()
            if len(results) > 0:
                ## already in the table
                # print("this ip already exists @@@@@@@@@@@@")
                return True
        except:
            return False
        return False

    ## check whether a proxy is usable
    def proxyIpCheck(self, ip, port):
        server = ip + ":" + str(port)
        proxies = {'http': 'http://' + server, 'https': 'https://' + server}
        try:
            r = requests.get('https://www.baidu.com/', proxies=proxies, timeout=1)
            if r.status_code == 200:
                return True
            else:
                return False
        except:
            return False
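Building the INSERT statement by string concatenation works, but it is fragile (quoting, SQL injection). A drop-in variant of insert_into_table using parameterized queries, offered here as a sketch rather than what the article ships:

    def insert_into_table(self, conn, item):
        # conn is the cursor-like object handed in by adbapi.runInteraction
        if self.exsist(item, conn):
            return
        if self.proxyIpCheck(item['ip'], item['port']) is False:
            print("proxy unusable, proxy:", item['ip'], ':', str(item['port']))
            return
        try:
            # let the MySQLdb driver handle escaping instead of concatenating strings
            conn.execute(
                "INSERT INTO t_ips (ip, port, type) VALUES (%s, %s, %s)",
                (item['ip'], item['port'], item['httpType'])
            )
        except Exception as e:
            logging.log(logging.WARNING, "insert error >> " + str(e))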
2.6 Test the spider: scrapy crawl <spider name>
3. The spider that extracts proxy IPs into the database is now done; what remains is the scheduled task that keeps the pool fresh.
Create a new start.py file in the same directory as the project's settings.py.
import os
import pymysql
import requests
import threading
from settings import *

## method called by the timer
def run():
    clearIpPool()
    ### re-arm the timer, otherwise it only runs once
    timer = threading.Timer(TIMER_STOP_TIME, run)
    timer.start()

def clearIpPool():
    print("timer fired, cleaning the ip pool database")
    ## re-crawl proxy ips with the scrapy command
    os.system('scrapy crawl xiciSpider --nolog')
    # walk the database and remove proxies that no longer work
    removeUnSafeProxyFromDB()
    print("timer run finished")

###### query the database, find unusable proxies and delete them
def removeUnSafeProxyFromDB():
    # open the database connection
    db = pymysql.connect(host="39.108.112.254", user="root",
                         password="abc123|||456", database="xici")
    # get a cursor
    cursor = db.cursor()
    # fetch every stored proxy
    sql = "SELECT * FROM t_ips"
    try:
        cursor.execute(sql)
        results = cursor.fetchall()
        for row in results:
            id = row[0]
            ip = row[1]
            port = row[2]
            if proxyIpCheck(ip, str(port)) is False:
                print("proxy unusable, proxy:", ip, ':', str(port))
                ## delete it
                sql = "DELETE FROM t_ips WHERE id = " + str(id)
                cursor.execute(sql)
                print(sql)
                # commit the deletion
                db.commit()
    except:
        print("Error: unable to fetch data")
    # close the database connection
    db.close()

##### check whether a proxy is usable
def proxyIpCheck(ip, port):
    server = ip + ":" + str(port)
    proxies = {'http': 'http://' + server, 'https': 'https://' + server}
    try:
        r = requests.get('https://www.baidu.com/', proxies=proxies, timeout=1)
        if r.status_code == 200:
            return True
        else:
            return False
    except:
        return False

######## execution starts here
print("ip pool timer started, interval:", str(TIMER_STOP_TIME), 's')
######## start the timer; TIMER_STOP_TIME comes from settings.py
timer = threading.Timer(TIMER_STOP_TIME, run)
timer.start()
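With the pool refilling and cleaning itself, the last step is feeding these proxies to whatever spider does the actual large-scale scraping. One common approach is a Scrapy downloader middleware that picks a random proxy per request, reusing the same random-row query as the earlier sketch. The code below is illustrative only; RandomProxyMiddleware and the connection details are not part of this project:

import pymysql

class RandomProxyMiddleware(object):
    def __init__(self):
        # placeholder credentials; point these at the proxy pool database
        self.db = pymysql.connect(host='127.0.0.1', user='root',
                                  password='your-password', database='xici',
                                  charset='utf8')

    def process_request(self, request, spider):
        cursor = self.db.cursor()
        cursor.execute("SELECT ip, port FROM t_ips ORDER BY RAND() LIMIT 1")
        row = cursor.fetchone()
        if row:
            # Scrapy's built-in HttpProxyMiddleware picks this up
            request.meta['proxy'] = 'http://%s:%s' % (row[0], row[1])

Enable it in the consuming project's settings.py with something like DOWNLOADER_MIDDLEWARES = {'yourproject.middlewares.RandomProxyMiddleware': 350} (the module path is hypothetical).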