
Python crawler module


import pymysql
from urllib import request, parse
from urllib.error import HTTPError, URLError


def main(url, headers=None, data=None):  # dispatching entry point
    if not data:
        return get_response(url, headers=headers)
    else:
        return get_response(url, headers=headers, data=data)


def get_response(url, data=None, headers=None):
    if not headers:
        headers = {'User-Agent': get_agent()}
    try:
        if data:
            data = parse.urlencode(data)
            data = bytes(data, encoding='utf-8')
            req = request.Request(url, data=data, headers=headers)
        else:
            req = request.Request(url, headers=headers)
        response = request.urlopen(req)
        data = response.read().decode()
        return data  # return the response body
    except HTTPError as e:  # coarse error output, not well suited to debugging
        print(e)
    except URLError as e:
        print(e)


def get_agent(table=None):
    # User-Agent strings were generated in advance with the fake_useragent
    # module and stored in the database, so the crawler keeps working even
    # when fake_useragent itself cannot be called
    table = 'p_useragent'
    conn = pymysql.connect('127.0.0.1', 'root', '123456', 'PaChong', charset='utf8')
    cursor = conn.cursor()
    # connect to the database and pick one User-Agent row at random
    sql = ('SELECT * FROM {} WHERE id >= '
           '((SELECT MAX(Id) FROM {}) - (SELECT MIN(Id) FROM {})) * RAND() '
           '+ (SELECT MIN(Id) FROM p_useragent) LIMIT 1').format(table, table, table)
    cursor.execute(sql)
    useragent = cursor.fetchall()[0][1]
    return useragent


if __name__ == '__main__':
    url = 'http://fanyi.baidu.com/sug'
    data = {'kw': '中國'}
    import json
    res = json.loads(main(url, data=data))
    print(res)
    # url = 'http://www.baidu.com'
    # res = main(url)
    # print(res)
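
get_agent() assumes the p_useragent table already exists and is populated. A minimal one-time setup sketch, assuming a two-column schema (Id, useragent) that matches the fetchall()[0][1] access above; the schema, the row count, and the keyword-argument connect style are assumptions, not part of the original module:

import pymysql
from fake_useragent import UserAgent  # only needed for this one-off setup

# keyword arguments, as required by newer pymysql releases
conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                       database='PaChong', charset='utf8')
cursor = conn.cursor()
# assumed schema: column 0 is Id, column 1 is the User-Agent string
cursor.execute('CREATE TABLE IF NOT EXISTS p_useragent ('
               'Id INT AUTO_INCREMENT PRIMARY KEY, useragent VARCHAR(255))')
ua = UserAgent()
for _ in range(100):  # store 100 random User-Agent strings for later random reads
    cursor.execute('INSERT INTO p_useragent (useragent) VALUES (%s)', (ua.random,))
conn.commit()
conn.close()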

Normally, every crawler walks through the same analyze -> request -> response -> download (store) pipeline, yet much of that work is reinventing the wheel: issuing the request, attaching a request header, encoding the data for a POST. By collecting these pieces into a single .py file, later crawler scripts can simply import and call it, skipping the repetitive steps of filling in request headers, encoding POST parameters, and so on, as the sketch below shows.
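
For instance, if the module above is saved as spider_module.py (the filename is an assumption), a new crawler script reduces to:

import json
from spider_module import main  # hypothetical filename for the module above

# GET: a stored User-Agent header is attached automatically
html = main('http://www.baidu.com')

# POST: the dict is urlencoded and byte-encoded inside get_response()
res = json.loads(main('http://fanyi.baidu.com/sug', data={'kw': '中國'}))
print(res)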
