
Scraping p2p platform data from 網貸之家 (wdzj.com) with Python

The p2p platform data on 網貸之家 is fairly easy to obtain: no login is required, so the main work is reading the page source and extracting the fields you need. The crawler itself is simple. It uses the urllib package to fetch pages, BeautifulSoup to parse them, and regular expressions to pull out the data.
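Before the full script, here is a minimal sketch of that fetch-parse-extract pattern in isolation. It is illustrative only: the `itemTitle` selector matches the real script below, but the `/dangan/` path filter is an assumption about how the detail links look.

```python
import re
import urllib.request
from bs4 import BeautifulSoup

# Minimal fetch -> parse -> extract pipeline (illustrative sketch).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                         'Gecko/20100101 Firefox/23.0'}
url = "https://www.wdzj.com/dangan/search?filter=e1"   # category 1 (operating), page 1

req = urllib.request.Request(url, headers=headers)           # fetch with urllib
soup = BeautifulSoup(urllib.request.urlopen(req), 'lxml')    # parse with BeautifulSoup

# Extract: walk the platform-title divs and keep the detail-page links.
for div in soup.find_all("div", {"class": "itemTitle"}):
    for a in div.find_all("a", {"target": "_blank"}):
        href = a.get("href", "")
        if re.match(r"^/dangan/", href):   # assumed shape of a detail-page path
            print(href)
```

The full script below follows the same pattern, with pagination and per-platform field extraction on top: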

```python
# -*- coding: utf-8 -*-
"""
Created on Wed Aug  8 18:22:26 2018

@author: 95647
"""
import urllib
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                         'Gecko/20100101 Firefox/23.0'}
domains = "https://www.wdzj.com"


def get_platform_site(url, lists):
    """Collect the detail-page links of every platform on one search page."""
    req = urllib.request.Request(url, headers=headers)
    html = urlopen(req)
    bsObj = BeautifulSoup(html, 'lxml')
    for title in bsObj.findAll("div", {'class': 'itemTitle'}):
        for link in title.findAll("a", {'target': '_blank'}):
            if 'href' in link.attrs:
                lists.append(link.attrs['href'])
    return lists


def pages_num(url):
    """Read the total page count from the '尾頁' (last page) link."""
    req = urllib.request.Request(url, headers=headers)
    html = urlopen(req)
    bsObj = BeautifulSoup(html, 'lxml')
    for page in bsObj.findAll("a", text='尾頁'):
        if "currentnum" in page.attrs:
            return page.attrs["currentnum"]
    return None


def conditions(i):
    """Build the full link list for one operating status
    (1 = operating, 2 = closed/transformed, 3 = problem platforms)."""
    lists = []
    url_ = "https://www.wdzj.com/dangan/search?filter=e%s" % str(i)
    all_pages_num = int(pages_num(url_))
    for num in range(1, all_pages_num + 1):
        url = url_ + "&currentPage=%s" % str(num)
        lists = get_platform_site(url, lists)
    return lists


operations = conditions(1)            # normally operating platforms
# close_transitions = conditions(2)   # closed or transformed platforms
# in_problems = conditions(3)         # problem platforms


def plat_profile(lists):
    """Scrape the profile page of every platform in `lists`."""
    global domains
    profiles = []
    for site in lists:
        plat_info = []
        url = domains + site
        req = urllib.request.Request(url, headers=headers)
        html = urlopen(req)
        bsObj = BeautifulSoup(html, 'lxml')
        plat_name = bsObj.findAll('h1')[0].attrs["alt"]   # platform name
        # Launch date and region share one div, separated by the word 上線.
        t_l = bsObj.findAll("div", {"class": "pt-info"})[0].get_text()
        time_s = ""
        location = ""
        if len(t_l) > 0:
            t_l = re.split("上線", t_l)
            time_s = t_l[0].strip()     # launch date
            location = t_l[1].strip()   # region
        # Average term (text contains 月) and average yield (contains %).
        yield0 = ""
        duration = ""
        for data in bsObj.findAll("b", {"class": "tab_common_data"}):
            text = data.parent.get_text()
            if re.findall(".*月.*", text):
                duration = text.strip()   # average term
            if re.findall(".*%.*", text):
                yield0 = text.strip()     # average yield
        rates = ""   # initialise so a missing rating cannot raise NameError
        rates_ = bsObj.find("div", {"class": "dpxximg"})
        if "data-pl" in rates_.attrs:
            rates = rates_.attrs["data-pl"]   # user rating
        # The registration block is a list of label (class "l") / value
        # (class "r") pairs; read them into two parallel lists.
        plat_pro = bsObj.findAll("div", {"class": "bgbox-bt zzfwbox"})
        plat_pro = BeautifulSoup(str(plat_pro), "lxml")
        L1 = []
        L2 = []
        zzzj = gqss = yhtg = rzjl = jgxh = ICP = zdtb = zqzr = tbbz = bzms = ""
        for div in plat_pro.findAll("div", {"class": "l"}):
            L1.append(div.get_text().strip())
        for div in plat_pro.findAll("div", {"class": "r"}):
            L2.append(div.get_text().strip())
        for slzz in L1:                       # map each label to its value
            if slzz == "註冊資金":            # registered capital
                zzzj = L2[L1.index(slzz)]
            if slzz == "股權上市":            # listed shareholders
                gqss = L2[L1.index(slzz)].replace(" ", "")
            if slzz == "銀行存管":            # bank depository
                yhtg = L2[L1.index(slzz)]
            if slzz == "融資記錄":            # financing record
                rzjl = L2[L1.index(slzz)].replace(" ", "")
            if slzz == "監管協會":            # industry association
                jgxh = L2[L1.index(slzz)].replace(" ", "")
            if slzz == "ICP號":               # ICP licence number
                ICP = L2[L1.index(slzz)]
            if slzz == "自動投標":            # automatic bidding
                zdtb = L2[L1.index(slzz)]
            if slzz == "債權轉讓":            # debt transfer
                zqzr = L2[L1.index(slzz)]
            if slzz == "投標保障":            # bid protection
                tbbz = L2[L1.index(slzz)]
            if slzz == "保障模式":            # protection model
                bzms = L2[L1.index(slzz)]
        # Appending field by field is clumsy; see the pandas sketch further
        # down for a tidier record-based alternative.
        for value in (plat_name, time_s, location, duration, yield0, rates,
                      zzzj, gqss, yhtg, rzjl, jgxh, ICP, zdtb, zqzr, tbbz, bzms):
            plat_info.append(value)
        profiles.append(plat_info)
        print("------------->" + plat_name + str(lists.index(site)))  # progress log
    return profiles


profiles = plat_profile(operations)   # reuse the link list fetched above
name = ['平臺名稱', '上線時間', '區域', '投資期限', '平均收益率', '評分',
        '註冊資金', '股權上市', '銀行存管', '融資記錄', '監管協會', 'ICP號',
        '自動投標', '債權轉讓', '投標保障', '保障模式']
operating = pd.DataFrame(columns=name, data=profiles)
operating.to_csv(r"C:\Users\95647\Desktop\operating.csv",    # path to save the csv
                 encoding="utf_8_sig")   # utf_8_sig keeps Chinese readable in Excel
```
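One fragile spot worth flagging before running it: a single failed request, or a profile page missing one of the expected elements, raises an exception and aborts the whole crawl. A minimal hardening sketch, assuming you are happy to skip bad pages; `safe_soup` is a hypothetical helper, not part of the original script:

```python
import time
import urllib.request
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup

def safe_soup(url, headers, retries=2, delay=1.0):
    """Fetch and parse a page; return None instead of crashing the crawl."""
    for attempt in range(retries + 1):
        try:
            req = urllib.request.Request(url, headers=headers)
            return BeautifulSoup(urllib.request.urlopen(req), 'lxml')
        except (HTTPError, URLError) as exc:
            print("fetch failed (%s): %s" % (url, exc))
            time.sleep(delay)          # brief pause before retrying
    return None

# Inside plat_profile, each page fetch could then become:
#     bsObj = safe_soup(domains + site, headers)
#     if bsObj is None:
#         continue                    # skip this platform, keep crawling
```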

When I wrote this code I had not yet learned pandas, so I took the crude route of plain lists and dictionaries; readers comfortable with pandas can tidy this up considerably (one possible refactor is sketched after the screenshot below). The crawler's execution looks like this:
[Screenshot: crawler execution process]
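On the pandas point above: instead of appending sixteen fields in a fixed positional order, each platform can be collected as a dict and the column order applied once at the end. A hedged sketch of that refactor, where the dummy `record` stands in for the real per-page extraction done in `plat_profile`:

```python
import pandas as pd

columns = ['平臺名稱', '上線時間', '區域', '投資期限', '平均收益率', '評分',
           '註冊資金', '股權上市', '銀行存管', '融資記錄', '監管協會', 'ICP號',
           '自動投標', '債權轉讓', '投標保障', '保障模式']

records = []
# Inside the scraping loop, build one dict per platform instead of a
# positional list; fields that are missing simply stay absent and become NaN.
record = {'平臺名稱': 'some platform', '上線時間': '2015-01-01', '評分': '4.2'}
records.append(record)

# from_records lines the dicts up under a fixed column order in one step.
operating = pd.DataFrame.from_records(records, columns=columns)
operating.to_csv("operating.csv", index=False, encoding="utf_8_sig")
```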

The crawler runs quickly: scraping the profiles of 100 normally operating platforms took only about five minutes. Part of the output is shown below:
[Screenshot: sample of the scraped results]

If you have questions or other ideas, feel free to add me on QQ (956471511). I have also written a post on scraping data from 人人貸 (Renrendai) that may interest you: 手把手教你用python爬取人人貸借款人資料 (a step-by-step guide to scraping 人人貸 borrower data with Python).