Scraping P2P platform data from wdzj.com (網貸之家) with Python
The P2P platform data on wdzj.com is fairly easy to get: the real work is reading the page source and picking out the fields you need, and since no login is required, the crawler stays simple. It uses the urllib package to fetch pages, BeautifulSoup to parse them, and regular expressions to extract the data.
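In miniature, that fetch → parse → extract pipeline looks like the sketch below; the function name and the regex here are generic placeholders for illustration, not the wdzj.com-specific selectors used later.
import re
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

def extract_percentages(url):
    """Illustrative only: pull every percentage figure out of a page."""
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req)                   # urllib fetches the raw page
    soup = BeautifulSoup(html, 'lxml')    # BeautifulSoup builds the DOM
    return re.findall(r'\d+(?:\.\d+)?%', soup.get_text())   # regex extracts the data
With that shape in mind, here is the full source: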
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 8 18:22:26 2018
@author: 95647
"""
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pandas as pd
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
lists =[]
domains = "https://www.wdzj.com"
def get_platform_site(url, lists):
    """Collect the detail-page link of every platform on one listing page."""
    req = urllib.request.Request(url, headers=headers)
    html = urlopen(req)
    bsObj = BeautifulSoup(html, 'lxml')
    title = bsObj.findAll("div", {'class': 'itemTitle'})
    for titles in title:
        links = titles.findAll("a", {'target': '_blank'})
        for link in links:
            if 'href' in link.attrs:
                lists.append(link.attrs['href'])
    return lists
def pages_num(url):
    """Read the total page count from the '尾頁' (last page) link."""
    req = urllib.request.Request(url, headers=headers)
    html = urlopen(req)
    bsObj = BeautifulSoup(html, 'lxml')
    pages = bsObj.findAll("a", text='尾頁')
    for page in pages:
        if "currentnum" in page.attrs:
            return page.attrs["currentnum"]   # the page count sits in this attribute
    return None
def conditions(i):
    """Build the link list for one operating status: 1 = normal operation,
    2 = closed or transformed, 3 = problem platforms."""
    lists = []
    url_ = r"""https://www.wdzj.com/dangan/search?filter=e%s""" % str(i)
    all_pages_num = int(pages_num(url_))
    for num in range(1, all_pages_num + 1):
        url = url_ + "&currentPage=%s" % str(num)
        lists = get_platform_site(url, lists)
    return lists
operations = conditions(1)   # platforms in normal operation
#close_transitions = conditions(2)   # closed or transformed platforms
#in_problems = conditions(3)   # problem platforms
def plat_profile(lists):
    """Scrape the profile fields from every platform's detail page."""
    global domains
    plat_profile = []
    for site in lists:
        plat_info = []
        url = domains + site
        req = urllib.request.Request(url, headers=headers)
        html = urlopen(req)
        bsObj = BeautifulSoup(html, 'lxml')
        plat_name = bsObj.findAll('h1')[0].attrs["alt"]   # platform name
        t_l = bsObj.findAll("div", {"class": "pt-info"})[0].get_text()
        time_s = ""
        location = ""
        if len(t_l) > 0:
            t_l = re.split("上線", t_l)
            time_s = t_l[0].strip()     # launch date
            location = t_l[1].strip()   # platform's region
        common_data = bsObj.findAll("b", {"class": "tab_common_data"})
        yield0 = ""     # average yield, empty if absent
        duration = ""   # average term, empty if absent
        for data in common_data:
            text = data.parent.get_text()
            if len(re.findall(".*月.*", text)) > 0:
                duration = text.strip()   # average term
            if len(re.findall(".*%.*", text)) > 0:
                yield0 = text.strip()     # average yield
        rates = ""   # rating, empty if the page has none
        rates_ = bsObj.find("div", {"class": "dpxximg"})
        if rates_ is not None and "data-pl" in rates_.attrs:
            rates = rates_.attrs["data-pl"]   # user rating
        plat_pro = bsObj.findAll("div", {"class": "bgbox-bt zzfwbox"})
        plat_pro = BeautifulSoup(str(plat_pro), "lxml")
        L1 = []   # field labels
        L2 = []   # field values
        zzzj = ""   # registered capital
        gqss = ""   # equity listing
        yhtg = ""   # bank depository
        rzjl = ""   # financing record
        jgxh = ""   # regulatory association
        ICP = ""    # ICP licence number
        zdtb = ""   # automatic bidding
        zqzr = ""   # debt transfer
        tbbz = ""   # bid protection
        bzms = ""   # protection mode
        for div in plat_pro.findAll("div", {"class": "l"}):
            L1.append(div.get_text().strip())
        for div in plat_pro.findAll("div", {"class": "r"}):
            L2.append(div.get_text().strip())
        for slzz in L1:   # read the platform's filing information
            if slzz == "註冊資金":
                zzzj = L2[L1.index(slzz)]
            if slzz == "股權上市":
                gqss = L2[L1.index(slzz)].replace(" ", "")
            if slzz == "銀行存管":
                yhtg = L2[L1.index(slzz)]
            if slzz == "融資記錄":
                rzjl = L2[L1.index(slzz)].replace(" ", "")
            if slzz == "監管協會":
                jgxh = L2[L1.index(slzz)].replace(" ", "")
            if slzz == "ICP號":
                ICP = L2[L1.index(slzz)]
            if slzz == "自動投標":
                zdtb = L2[L1.index(slzz)]
            if slzz == "債權轉讓":
                zqzr = L2[L1.index(slzz)]
            if slzz == "投標保障":
                tbbz = L2[L1.index(slzz)]
            if slzz == "保障模式":
                bzms = L2[L1.index(slzz)]
        plat_info.append(plat_name)   # appending field by field is clumsy; there is room to optimize here
        plat_info.append(time_s)
        plat_info.append(location)
        plat_info.append(duration)
        plat_info.append(yield0)
        plat_info.append(rates)
        plat_info.append(zzzj)
        plat_info.append(gqss)
        plat_info.append(yhtg)
        plat_info.append(rzjl)
        plat_info.append(jgxh)
        plat_info.append(ICP)
        plat_info.append(zdtb)
        plat_info.append(zqzr)
        plat_info.append(tbbz)
        plat_info.append(bzms)
        plat_profile.append(plat_info)
        print("------------->" + plat_name + str(lists.index(site)))   # progress: platform name and index
    return plat_profile
plat_profile = plat_profile(operations)   # operations came from conditions(1), i.e. normally operating platforms; pass 2 or 3 for the other statuses
name = ['平臺名稱','上線時間','區域','投資期限','平均收益率','評分',
'註冊資金', '股權上市', '銀行存管', '融資記錄', '監管協會',
'ICP號', '自動投標', '債權轉讓', '投標保障', '保障模式']
operating = pd.DataFrame(columns=name, data=plat_profile)
operating.to_csv(r"""C:\Users\95647\Desktop\operating.csv""")   # path to save the csv file
I had not yet learned pandas when I wrote this code, so I solved the problem bluntly with lists and dictionaries; readers who know pandas can use it to tidy this up.
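As a minimal sketch of that optimization (the sample row below is a made-up placeholder; the column names are the same ones used above), each platform could be collected as a dict and the DataFrame built directly, so fields are matched by key instead of by position:
import pandas as pd

rows = []   # one dict per platform, appended inside the scraping loop
rows.append({'平臺名稱': '示例平臺', '上線時間': '2015-06-01', '平均收益率': '9.8%'})
frame = pd.DataFrame(rows)                   # dict keys become the column names
frame.to_csv('operating.csv', index=False)   # same CSV output as above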
The crawler runs quickly: scraping the information of 100 normally operating platforms took only about 5 minutes.
If you run into problems or have other ideas, feel free to reach me on QQ: 956471511. I also have a post on scraping data from Renrendai (人人貸) that may interest you: 手把手教你用python爬取人人貸借款人資料 (a step-by-step guide to scraping borrower data from Renrendai with Python).