
Crawler proxy IP pool: scrape millions of records with ease

1. Today we will cover something very useful: a proxy IP pool. The end result is a task that runs at a fixed interval, crawls a target proxy-listing site for usable proxies, stores them in a MySQL database, and checks whether the proxies already in the database are still usable, deleting any that are not.
2. Write the crawler that extracts proxy IPs into the database
2.1 Prepare the MySQL table

CREATE TABLE `t_ips` (
  `id` int(10) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `ip` varchar(15) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'ip',
  `port` int(10) NOT NULL COMMENT 'port',
  `type` int(10) NOT NULL DEFAULT '0' COMMENT '0:http 1:https',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=421 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci COMMENT='proxy ip table';

2.2 Create the Scrapy project and write items.py (the fields map to the database columns)

import scrapy

class IpsItem(scrapy.Item):
    # fields correspond to the columns of the t_ips table
    ip = scrapy.Field()
    port = scrapy.Field()
    httpType = scrapy.Field()

2.3 Write settings.py

# -*- coding: utf-8 -*-

#################### custom settings ####################
MAX_PAGE = 2          ## number of pages of the proxy listing site to crawl
TYPE = 0              ### proxy type: 0 = http, 1 = https
URL = 'http://www.bugng.com/gnpt?page='   ### proxy listing URL
TIMER_STOP_TIME = 20  ### pause between timer runs, in seconds
##########################################################

BOT_NAME = 'ips'
SPIDER_MODULES = ['ips.spiders']
NEWSPIDER_MODULE = 'ips.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

ITEM_PIPELINES = {
    'ips.pipelines.IpsPipeline': 300,
}

# Disable retries
RETRY_ENABLED = False
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Shorten the download timeout
DOWNLOAD_TIMEOUT = 2
# Disable cookies
COOKIES_ENABLED = False
# Delay downloads to avoid getting banned
DOWNLOAD_DELAY = 2

2.4 Write the spider

This uses bs4 (BeautifulSoup), which you need to install yourself: pip install beautifulsoup4
# -*- coding: utf-8 -*-
import scrapy
import logging
from bs4 import BeautifulSoup
from ips.items import IpsItem
from ips.settings import *

class XicispiderSpider(scrapy.Spider):
    name = 'xiciSpider'
    allowed_domains = ['xicidaili.com']
    start_urls = ['http://xicidaili.com/']

    ### Build the initial requests, one per listing page
    def start_requests(self):
        req = []
        for i in range(MAX_PAGE):
            ### URL of the i-th page of the proxy listing site
            req.append(scrapy.Request(URL + str(i)))
        return req

    ## Callback for each listing page, parsed with bs4
    def parse(self, response):
        print('@@@@@@@@@ start parsing ' + response.url)
        try:
            soup = BeautifulSoup(str(response.body, encoding="utf-8"), 'html.parser')
            trs = soup.find('table', {'class': 'table'}).find_all('tr')
            for tr in trs[1:]:          # skip the header row
                tds = tr.find_all('td')
                cur = 0
                item = IpsItem()
                item['httpType'] = TYPE
                for td in tds:
                    if cur == 0:
                        item['ip'] = td.text
                    if cur == 1:
                        item['port'] = td.text
                    cur = cur + 1
                yield item              #### hand the item to the pipeline
        except Exception as e:
            logging.log(logging.WARNING, '@@@@@@@@@ start parser ' + str(e))

2.5 Write the pipeline

This requires installing: pip install mysqlclient

Two checks are performed before inserting a record into the database:

1. Does the proxy already exist in the table?

2. Is the proxy actually usable?

# -*- coding: utf-8 -*-
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
import logging
import requests

class IpsPipeline(object):
    def __init__(self):
        dbargs = dict(
            host='your database host',
            db='your database name',
            user='root',
            passwd='your database password',
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        self.dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)

    ## Handle every item yielded by the spider
    def process_item(self, item, spider):
        self.dbpool.runInteraction(self.insert_into_table, item)
        return item

    def insert_into_table(self, conn, item):
        # Skip the item if the proxy is already in the table
        if self.exsist(item, conn):
            return
        # Only insert the proxy if it is actually usable
        if self.proxyIpCheck(item['ip'], item['port']) is False:
            print("This proxy is not usable, proxy:", item['ip'], ':', str(item['port']))
            return
        sql = ('insert into t_ips (ip,port,type) VALUES ("' + item['ip'] + '",'
               + str(item['port']) + ',' + str(item['httpType']) + ')')
        try:
            conn.execute(sql)
            print(sql)
        except Exception as e:
            logging.log(logging.WARNING, "sql error >> " + sql)

    ## Check whether the proxy already exists in the table
    def exsist(self, item, conn):
        sql = 'select * from t_ips where ip="' + item['ip'] + '" and port=' + str(item['port'])
        try:
            conn.execute(sql)
            results = conn.fetchall()
            if len(results) > 0:   ## already exists
                return True
        except:
            return False
        return False

    ## Check whether the proxy is usable
    def proxyIpCheck(self, ip, port):
        server = ip + ":" + str(port)
        proxies = {'http': 'http://' + server, 'https': 'https://' + server}
        try:
            r = requests.get('https://www.baidu.com/', proxies=proxies, timeout=1)
            if r.status_code == 200:
                return True
            else:
                return False
        except:
            return False
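
The insert above builds its SQL by string concatenation, which works for this tutorial but breaks if the scraped text ever contains a quote character. A safer variant is a parameterized query; below is a minimal sketch of how insert_into_table could be written that way, assuming the same IpsPipeline class and imports as above (the MySQL driver escapes the values itself):

    def insert_into_table(self, conn, item):
        # Skip proxies that already exist or do not work
        if self.exsist(item, conn):
            return
        if self.proxyIpCheck(item['ip'], item['port']) is False:
            return
        # %s placeholders let the MySQL driver escape the values
        sql = 'insert into t_ips (ip, port, type) VALUES (%s, %s, %s)'
        try:
            conn.execute(sql, (item['ip'], int(item['port']), int(item['httpType'])))
        except Exception as e:
            logging.log(logging.WARNING, 'insert failed: ' + str(e))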

2.6 Test the crawler: scrapy crawl <spider name>, here scrapy crawl xiciSpider
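
After the test run, you can confirm that proxies are actually landing in the table. Below is a minimal check using pymysql; the host, user and password are placeholders for your own database settings:

import pymysql

db = pymysql.connect("your-db-host", "root", "your-db-password", "xici")
cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM t_ips")
print("proxies in pool:", cursor.fetchone()[0])
db.close()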

3. With that, the crawler that extracts proxy IPs into the database is done. Next we write the scheduled task that keeps the pool clean.

Create a new start.py file in the same directory as the crawler project's settings.py:

import os
import requests
import pymysql
import threading
from settings import *

## Method called by the timer
def run():
    clearIpPool()
    ### Re-arm the timer, otherwise it only runs once
    timer = threading.Timer(TIMER_STOP_TIME, run)
    timer.start()

def clearIpPool():
    print("Timer fired, cleaning the proxy IP pool")
    ## Re-crawl proxy IPs via the scrapy command
    os.system('scrapy crawl xiciSpider --nolog')
    # Walk the database and remove unusable proxies
    removeUnSafeProxyFromDB()
    print("Timer run finished")

###### Query the database, find unusable proxies and delete them
def removeUnSafeProxyFromDB():
    # Open the database connection
    db = pymysql.connect("39.108.112.254", "root", "abc123|||456", "xici")
    # Get a cursor
    cursor = db.cursor()
    # SQL query
    sql = "SELECT * FROM t_ips"
    try:
        cursor.execute(sql)
        results = cursor.fetchall()
        for row in results:
            id = row[0]
            ip = row[1]
            port = row[2]
            if proxyIpCheck(ip, str(port)) is False:
                print("This proxy is not usable, proxy:", ip, ':', str(port))
                ## Delete the dead proxy
                sql = "DELETE FROM t_ips WHERE id = " + str(id)
                cursor.execute(sql)
                print(sql)
                # Commit the change
                db.commit()
    except:
        print("Error: unable to fetch data")
    # Close the database connection
    db.close()

##### Check whether a proxy is usable
def proxyIpCheck(ip, port):
    server = ip + ":" + str(port)
    proxies = {'http': 'http://' + server, 'https': 'https://' + server}
    try:
        r = requests.get('https://www.baidu.com/', proxies=proxies, timeout=1)
        if r.status_code == 200:
            return True
        else:
            return False
    except:
        return False

######## Execution starts here ########
print("Proxy pool timer started, interval:", str(TIMER_STOP_TIME), 's')
######## Start the timer; TIMER_STOP_TIME comes from settings.py
timer = threading.Timer(TIMER_STOP_TIME, run)
timer.start()
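
With start.py running (python start.py), the pool keeps itself fresh. To actually use it, a consumer simply picks a proxy from t_ips and passes it to its own requests. Below is a minimal sketch of that, not part of the original project; the database credentials and the target URL are placeholders:

import random
import pymysql
import requests

def get_proxy():
    # Pick a random proxy from the pool (credentials are placeholders)
    db = pymysql.connect("your-db-host", "root", "your-db-password", "xici")
    cursor = db.cursor()
    cursor.execute("SELECT ip, port, type FROM t_ips")
    rows = cursor.fetchall()
    db.close()
    if not rows:
        return None
    ip, port, http_type = random.choice(rows)
    scheme = 'https' if http_type == 1 else 'http'
    return {scheme: scheme + '://' + ip + ':' + str(port)}

proxies = get_proxy()
if proxies:
    # Use the proxy for your own crawl target (placeholder URL)
    r = requests.get('http://example.com/', proxies=proxies, timeout=5)
    print(r.status_code)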