python爬取 拉勾網 網際網路大資料職業情況
阿新 • • 發佈:2018-12-11
- 爬取拉勾網資訊
- 資料處理
- 製圖
所需知識只有一點點(畢竟是個小白):
- requests基礎部分
- json
- pyecharts
- wordcloud
接下來開始敲程式碼了,程式碼分成了3個部分:爬取、製圖、生成詞雲
爬取部分:
首先要說明的是,拉勾網有反爬蟲,所以,requests中的頭部參數和cookie應當寫全(直接在瀏覽器上F12,然後複製貼上就行) 然後就可以進行爬取了,主要程式碼如下:
import json
import requests

# Request headers copied verbatim from a logged-in browser session (F12):
# lagou.com rejects requests that lack a complete Cookie / Referer / UA set.
header = {"Cookie": "JSESSIONID=ABAAABAAAGFABEFB093DFBA72E00093316821E95E4971CF; user_trace_token=20180921154401-430b2b64-ca4b-411e-be98-f86172880bd3; _ga=GA1.2.231300040.1537515842; LGSID=20180921154401-1b73362c-bd72-11e8-a516-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E4%25BA%2592%25E8%2581%2594%25E7%25BD%2591%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE%3Fpx%3Ddefault%26city%3D%25E4%25B8%258A%25E6%25B5%25B7; LGUID=20180921154401-1b73384e-bd72-11e8-a516-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1537515842; _gid=GA1.2.1479618188.1537515843; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1537515859; LGRID=20180921154418-259c60d0-bd72-11e8-bb56-5254005c3644; SEARCH_ID=801b32db85184bc0910c10a3da7a18ad",
          "Host": "www.lagou.com",
          'Origin': 'https://www.lagou.com',
          'Referer': 'https://www.lagou.com/jobs/list_%E4%BA%92%E8%81%94%E7%BD%91%E5%A4%A7%E6%95%B0%E6%8D%AE?px=default&city=%E4%B8%8A%E6%B5%B7',
          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}

# Form data for the positionAjax endpoint; 'pn' (page number) is
# overridden per request inside get_result().
Data = {'first': 'true', 'kd': '網際網路大資料', 'pn': '1'}

# Output column order — must stay in sync with the header line written
# by the __main__ section.
FIELDS = ['city', 'companyFullName', 'companyId', 'companyLabelList',
          'companyShortName', 'companySize', 'businessZones', 'firstType',
          'secondType', 'education', 'industryField', 'positionId',
          'positionAdvantage', 'positionName', 'positionLables',
          'salary', 'workYear']


def start_requests(myurl):
    """POST the first search page and return the reported page count.

    :param myurl: positionAjax URL for one city (set by __main__).
    :return: value of content.pageSize in the JSON response.
    """
    r1 = requests.post(myurl, data=Data, headers=header, verify=False)
    content = json.loads(r1.text)
    pagesize = content.get('content').get('pageSize')
    return pagesize


def get_result(pagesize):
    """Fetch every result page and append one CSV-ish line per position.

    Relies on the module-level globals ``myurl`` and ``file`` set by
    __main__.  Commas inside field values are replaced with ';' so that
    ',' can later serve as the column separator.
    """
    for page in range(1, pagesize + 1):
        # Bug fix: the original appended str(page) to a URL that already
        # ended in 'pn=1', yielding 'pn=12', 'pn=13', ...  The page number
        # belongs in the POST body instead.
        payload = dict(Data, pn=str(page))
        content_next = json.loads(
            requests.post(myurl, data=payload, headers=header, verify=False).text)
        company_info = content_next.get('content').get('positionResult').get('result')
        if company_info:
            for p in company_info:
                line = ','.join(str(p[f]).replace(',', ';') for f in FIELDS) + '\n'
                file.write(line)
注:replace(‘,’,‘;’)是將爬取的資訊中的“,”改成“;”,為了方便後面製表。
if __name__ == '__main__':
    # Header row matching the field order written by get_result().
    title = 'city,companyFullName,companyId,companyLabelList,companyShortName,companySize,businessZones,firstType,secondType,education,industryField,positionId,positionAdvantage,positionName,positionLables,salary,workYear\n'
    cityList = ['北京', '上海', '深圳', '廣州', '杭州', '成都', '南京', '武漢',
                '西安', '廈門', '長沙', '蘇州', '天津', '鄭州']
    # 'with' guarantees the handle is closed even if a request fails
    # mid-crawl (the original leaked it on any exception); utf-8 is pinned
    # because the rows contain Chinese text.
    with open('爬取拉勾網.txt', 'a', encoding='utf-8') as file:
        file.write(title)
        for city in cityList:
            print('爬取%s' % city)
            # NOTE(review): get_result() reads this as a global.
            myurl = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city={}&needAddtionalResult=false&pn=1'.format(city)
            pagesize = start_requests(myurl)
            get_result(pagesize)
然後就可以獲得爬取的檔案了 到這裡,第一步就完成了。
製圖部分:
爬取到的檔案,薪水是一個區間,不方便處理,所以要進行求平均。
def cut_word(word, method):
    """Extract one end of a salary-range string such as '10k-20k'.

    :param word: raw salary text, e.g. '10k-20k' or '15k'.
    :param method: 'bottom' returns the lower bound; anything else the
        upper bound.  For a single value both bounds are equal.
    :return: the numeric part as a string (e.g. '10').
    """
    position = word.find('-')
    if position != -1:
        # Strip a trailing 'k'/'K' from each side.  The original sliced
        # word[:position-1], hard-coding that exactly one character
        # precedes the dash — that silently drops a digit for inputs
        # like '10-20k' that lack the 'k' on the lower bound.
        bottomsalary = word[:position].rstrip('kK')
        topsalary = word[position + 1:].rstrip('kK')
    else:
        # Single value: everything before the (case-insensitive) 'k'.
        bottomsalary = word[:word.upper().find('K')]
        topsalary = bottomsalary
    return bottomsalary if method == "bottom" else topsalary
# Split the 'salary' range into numeric bounds and average them.
df_duplicates['topsalary'] = df_duplicates.salary.apply(cut_word, method="top")
df_duplicates['bottomsalary'] = df_duplicates.salary.apply(cut_word, method="bottom")
# Bug fix: astype() returns a NEW Series — the original discarded the
# result, leaving both columns as strings.  Assign them back.
df_duplicates['bottomsalary'] = df_duplicates.bottomsalary.astype('int')
df_duplicates['topsalary'] = df_duplicates.topsalary.astype('int')
df_duplicates["avgsalary"] = df_duplicates.apply(
    lambda x: (int(x.bottomsalary) + int(x.topsalary)) / 2, axis=1)
然後,就可以根據資料畫圖了。 注:要先把檔案格式轉化為csv,可方便後續的操作: (之前將“,”改成“;”就是為了方便這裡操作) 然後就生成了表格
然後利用pyecharts模組畫圖就行了,程式碼如下:
import pandas as pd
from pyecharts import Bar
from pyecharts import Page
from pyecharts import Pie
# Load the scraped table and keep only the first row for each positionId
# (the same posting can show up on several search pages).
df = pd.read_csv('/home/lsgo28/PycharmProjects/demo/爬取拉勾網.csv')
df_duplicates = df.drop_duplicates(subset='positionId')
# Distinct cities, in first-seen order, for the bar-chart axes below.
city_list = df_duplicates['city'].drop_duplicates()
money = []    # average salary (K/month) per city, aligned with city_list
money1 = []   # average salary per experience level (filled further below)
number = []   # position count per city
number1 = []  # position count per experience level (filled further below)
# Accumulators for the experience-level loop that follows this block.
m = 0
n = 0
for city in city_list:
    # Bug fix: the original initialised its accumulators (k, l) once
    # before the loop and never reset them, so every city after the
    # first reported a cumulative count and a running overall average
    # instead of its own figures.
    total = 0
    count = 0
    for city_, money_ in zip(df_duplicates["city"], df_duplicates["avgsalary"]):
        if city_ == city:
            total += money_
            count += 1
    # count >= 1: city_list is derived from df_duplicates itself.
    number.append(count)
    money.append(total / count)
workyears_list = ['應屆畢業生', '1年以下', '1-3年', '3-5年', '5-10年', '10年以上', '不限']
for workyears in workyears_list:
    # Bug fix: as with the city loop, the original never reset its
    # accumulators (m, n), so each experience level reported cumulative
    # values rather than its own count/average.
    total = 0
    count = 0
    for workyears_, money_ in zip(df_duplicates["workYear"], df_duplicates["avgsalary"]):
        if workyears_ == workyears:
            total += money_
            count += 1
    number1.append(count)
    # workyears_list is hard-coded, so a level may be absent from the
    # data; guard the division (the original could divide by zero).
    money1.append(total / count if count else 0)
# Build the four charts and render them into one HTML page.
# NOTE(review): this is the pyecharts 0.x API (Bar/Pie/Page imported from
# the package root); v1+ moved these under pyecharts.charts and changed
# the add() signature — confirm the installed version before upgrading.
pie = Pie("職位數量餅圖")                       # position counts per experience level
bar = Bar("各大城市職業的平均月薪(K)")          # average monthly salary per city
bar1 = Bar("各大城市公司數")                    # position count per city
bar2 = Bar("各類工作經驗平均月薪(K)")           # average salary per experience level
# number1/money1 come from the experience-level loop; money/number from
# the city loop above.
pie.add("", workyears_list, number1, is_stack=True)
bar.add("月薪", city_list, money, is_stack=True)
bar1.add("公司", city_list, number, is_stack=True)
bar2.add("經驗", workyears_list, number1, is_stack=True)
# A Page stacks all charts vertically in a single output file.
page = Page()
page.add(bar)
page.add(bar1)
page.add(bar2)
page.add(pie)
page.render("圖文.html")
這樣,想要的圖片就畫完了 到這裡,第二步就完成了
畫詞雲
想畫詞雲,首先得有一個儲存詞語的txt檔案,我們就將爬取的檔案中positionLables列中的詞語繪製一個詞雲 顯然,一個單元格中存有不止一個的詞語,這樣就很難生成一個詞雲。 你可以通過jieba分詞包對該檔案進行分詞(有點麻煩),我這裡用了一個取巧的辦法:將該列複製,然後開啟一個新的csv檔案,貼上進去,然後這樣改: 就把【】‘’去掉了,而且也達到了分詞的效果, 然後把csv格式轉化成txt就行了。 然後選取一個喜歡的背景圖片: 接著就是畫詞雲了,程式碼如下:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

# Background image doubles as the cloud's shape mask and colour source.
bg = np.array(Image.open("12.jpg"))
# Bug fix: the file holds Chinese text, so pin the encoding — the default
# is platform-dependent and can raise UnicodeDecodeError elsewhere.
with open('/home/lsgo28/PycharmProjects/demo/ciyun.txt', 'r', encoding='utf-8') as f:
    fl = f.read()
# One word per line so WordCloud tokenises each label separately.
fl = fl.replace(',', '\n')
wc = WordCloud(background_color="white",
               max_words=200,
               mask=bg,            # shape the cloud like the image
               max_font_size=60,
               random_state=42,    # deterministic layout between runs
               font_path='/home/lsgo28/PycharmProjects/demo/ziti.ttf').generate(fl)
# Recolour the words from the background image's palette.
image_color = ImageColorGenerator(bg)
plt.imshow(wc.recolor(color_func=image_color))
# NOTE(review): plt.show() is never called, so the matplotlib preview is
# never displayed — the PNG written below is the actual output.
wc.to_file("ciyun.png")
大功告成