Python中代理伺服器的設定(requests、urllib)
阿新 • 發佈:2019-01-29
urllib 方式
def user_proxy(proxy_addr, url):
    """Download *url* through the proxy at *proxy_addr* and return the page text.

    proxy_addr: "host:port" string of the proxy server.
    url:        address to fetch (http or https).
    Returns the response body decoded as UTF-8.
    Raises urllib.error.URLError when the proxy is unreachable or wrong.
    """
    import urllib.request
    # BUG FIX: the original registered the proxy for 'http' only, yet the
    # driver below fetches an https URL — so the proxy was silently bypassed.
    # Register it for both schemes.
    proxy = urllib.request.ProxyHandler({'http': proxy_addr,
                                         'https': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    return data


proxy_addr = "114.82.109.134:8118"
data = user_proxy(proxy_addr, "https://www.baidu.com")
print(data)
print(len(data))
如果此時代理伺服器地址失效或者填寫錯了代理伺服器,則會發生錯誤。
requests 方式
import requests

# Proxy endpoint to route the request through.
ip, port = ("111.201.81.189", "8118")
url = 'http://www.163.com/'
proxy_url = "http://{0}:{1}".format(ip, port)
print(proxy_url)

# requests takes a scheme -> proxy-URL mapping.
proxy_dict = {
    "http": proxy_url
}
response = requests.get(url, proxies=proxy_dict)

# The site serves GBK-encoded HTML; decode the raw bytes accordingly.
html_doc = str(response.content, 'gbk')
print(html_doc)
一個代理 ip 池的例子
將西刺免費代理IP存入 MySQL 資料庫,然後每次用的時候隨機選取,如果不可用的話,再次隨機選取,直到選到為止。
MySQL 建表語句:
-- Database holding the proxy pool; utf8 so non-ASCII text stores safely.
create database proxy_pool character set utf8;

-- One row per crawled proxy endpoint.
create table proxy_ip(
    ip varchar(20) not null,        -- proxy host address
    port varchar(255) not null,     -- proxy port (stored as text)
    speed float,                    -- measured response time in seconds
    proxy_type varchar(5),          -- e.g. HTTP / HTTPS
    available char(1) default null  -- '1' = verified working, '0' = dead
) default charset = utf8;
Python 程式碼編寫:
import requests
from scrapy import Selector
__author__ = 'liwei'
import MySQLdb
# Module-level MySQL connection and cursor, shared by every GetIp method.
# NOTE(review): credentials are hard-coded — acceptable for a demo only.
conn = MySQLdb.connect(host='127.0.0.1', port=3306, user='root', passwd='123456', charset='utf8', db='proxy_pool')
cursor = conn.cursor()
class GetIp():
    """Maintain a pool of proxy IPs in MySQL: crawl, validate, and pick them.

    All methods use the module-level ``conn``/``cursor`` pair.
    """

    def update_available_ip(self, ip, available):
        """Mark *ip* as available ('1') or unavailable ('0') in proxy_ip.

        Returns True after committing.
        """
        # Parameterized query instead of str.format: avoids SQL injection
        # and quoting bugs with crawled data.
        update_available_ip_sql = "update proxy_ip set available = %s where ip = %s"
        cursor.execute(update_available_ip_sql, (available, ip))
        conn.commit()
        return True

    def delete_ip(self, ip):
        """Delete the row for *ip* from proxy_ip. Returns True after commit."""
        delete_ip_sql = "delete from proxy_ip where ip = %s"
        cursor.execute(delete_ip_sql, (ip,))
        conn.commit()
        return True

    def judge_ip(self, ip, port):
        """Return True when the proxy ip:port can fetch a test page.

        A proxy that fails (no response, or non-2xx status) is flagged
        available='0' in the database and False is returned.
        """
        http_url = 'http://www.163.com/'
        proxy_url = 'http://{0}:{1}'.format(ip, port)
        print("proxy_url", proxy_url)
        try:
            proxy_dict = {
                'http': proxy_url
            }
            # Timeout so a dead proxy fails fast instead of hanging forever.
            response = requests.get(http_url, proxies=proxy_dict, timeout=10)
        except Exception as e:
            print("[沒有返回]代理 ip {0} 及 埠號 {1} 不可用,即將從資料庫中刪除".format(ip, port))
            # self.delete_ip(ip)
            self.update_available_ip(ip, '0')
            return False
        else:
            code = response.status_code
            # BUG FIX: the original tested `code >= 200 or code < 300`,
            # which is true for EVERY status code. Any 2xx means success.
            if 200 <= code < 300:
                print("代理 ip {0} 及 埠號 {1} 可用".format(ip, port))
                html_doc = str(response.content, 'gbk')
                print(html_doc)
                return True
            else:
                print("[有返回,但是狀態碼異常]代理 ip {0} 及 埠號 {1} 不可用,即將從資料庫中刪除".format(ip, port))
                # self.delete_ip(ip)
                self.update_available_ip(ip, '0')
                return False

    def get_random_ip(self):
        """Return a random working proxy URL "http://ip:port" from the pool.

        Picks rows at random and validates them, recursing until one works.
        Raises RuntimeError when the table is empty (the original crashed
        with an opaque TypeError on ``None[0]`` in that case).
        """
        select_random = '''
            select ip,port,speed,proxy_type from proxy_ip order by rand() limit 1
        '''
        cursor.execute(select_random)
        result = cursor.fetchone()
        if result is None:
            raise RuntimeError("proxy_ip table is empty - run crawl_ips() first")
        ip = result[0]
        port = result[1]
        judge_re = self.judge_ip(ip, port)
        if judge_re:
            self.update_available_ip(ip, '1')
            return "http://{0}:{1}".format(ip, port)
        else:
            # Proxy was dead; try another random row.
            return self.get_random_ip()

    def crawl_ips(self):
        """Scrape the xicidaili free-proxy listing and insert rows into MySQL."""
        headers = {"user-agent": "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1"}
        for i in range(1, 2):
            response = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)
            selector = Selector(text=response.text)
            all_trs = selector.css("#ip_list tr")
            ip_list = []
            for tr in all_trs[1:]:  # skip the header row
                # Third 'country' cell carries the speed bar; its title
                # reads like "0.123秒".
                speed_str = tr.css("td[class='country']")[2]
                title = speed_str.css(".bar::attr(title)").extract()[0]
                # Guard against an empty title (original had a dead
                # `if title: pass` here and would crash on float("")).
                speed = float(title.split("秒")[0]) if title else 0.0
                all_texts = tr.css("td::text").extract()
                print(all_texts)
                ip = all_texts[0]
                port = all_texts[1]
                attr = all_texts[4]
                # `proxy_type` renamed from `type` to avoid shadowing the
                # builtin. Column layout shifts by one cell on some rows;
                # this mirrors the original correction logic.
                proxy_type = all_texts[5]
                if attr == 'HTTPS' or attr == 'HTTP':
                    attr = '----------'
                    proxy_type = all_texts[4]
                ip_list.append((ip, port, speed, proxy_type))
            # Persist the page's proxies. Parameterized insert instead of
            # str.format: crawled text must never be spliced into SQL.
            for ip_info in ip_list:
                insert_sql = '''
                    insert into proxy_ip(ip,port,speed,proxy_type)
                    values(%s,%s,%s,%s)'''
                print(insert_sql, ip_info)
                cursor.execute(insert_sql, (ip_info[0], ip_info[1], ip_info[2], ip_info[3]))
                conn.commit()
if __name__ == '__main__':
    ip_pool = GetIp()
    # ip_pool.crawl_ips()  # uncomment to (re)populate the pool first
    # Draw one verified proxy at random from the database...
    proxy = ip_pool.get_random_ip()
    print("可用的 ip 和埠號是:", proxy)
    # ...then hand it to the crawler.