2018-12月**網java、python、c/c++,php招聘分析
阿新 • • 發佈:2018-12-28
2018年即將結束,java、python、c/c++,php四種語言在北京,上海,廣州,深圳四個一線城市的招聘資訊分析,資料樣本來自前30頁的資料,樣本大小大概6058個。
1,資料抓取
非常簡單,基本上沒有遇到反抓取策略
def downloader(city, keyword, page, max_retries=None, retry_delay=1.0, timeout=10):
    """Fetch one page of Lagou search results and upsert each job into MongoDB.

    The request is retried until a page has been stored successfully (or until
    ``max_retries`` attempts have been made).

    :param city: city name; URL-quoted into the ``city`` query parameter
    :param keyword: search keyword, sent as the ``kd`` form field
    :param page: page number, sent as the ``pn`` form field
    :param max_retries: maximum number of attempts; ``None`` (default) retries
        forever, which is what the original loop did
    :param retry_delay: seconds to wait between attempts
    :param timeout: per-request timeout in seconds passed to ``requests.post``
    :return: None
    """
    import time  # local import so this snippet does not depend on a file-level one

    url = "https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false" \
        .format(quote(city))
    # Kept under its own name: the original rebound ``data`` to the parsed
    # response, so every retry posted the previous response as the form body.
    form = {
        "first": "false",
        "pn": page,
        "kd": keyword
    }
    # NOTE(review): this looks like a cookie captured from one browser session;
    # presumably it expires and has to be refreshed - confirm before reuse.
    cookie = (
        "WEBTJ-ID=20181228093856-167f276e34849d-015bd2bf49274b-6114147a-1327104-167f276e34a334; "
        "_ga=GA1.2.651225173.1545961137; "
        "_gid=GA1.2.952777220.1545961137; "
        "user_trace_token=20181228093740-29e0dba1-0a41-11e9-b14d-525400f775ce; "
        "PRE_HOST=www.baidu.com; "
        "PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xdc8f964d00002f4f%26issp%3D1%26f%3D8%26rsv_bp%3D1%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26rsv_enter%3D1%26oq%3D%2525E4%2525B8%252593%2525E8%2525B5%252584%2525E5%25258A%25259E%26rsv_t%3Df7a1d2gJnPyNK%252FsS4vTWJ9EOKhzAsK05aVgqC43iWtqWmiKpIp0u6YQblMkUzbi3KwO7%26inputT%3D8441%26rsv_pq%3D9f44c2a800002af6%26rsv_sug3%3D57%26rsv_sug1%3D62%26rsv_sug7%3D101%26bs%3D%25E4%25B8%2593%25E8%25B5%2584%25E5%258A%259E; "
        "LGUID=20181228093740-29e0e252-0a41-11e9-b14d-525400f775ce; "
        "LGSID=20181228093745-2cd1a71c-0a41-11e9-b14d-525400f775ce; "
        "PRE_UTM=m_cf_cpc_baidu_pc; "
        "PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpc_baidu_pc%26m_kw%3Dbaidu_cpc_bj_e110f9_d2162e_%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591; "
        "JSESSIONID=ABAAABAAAGGABCB3EDF3AFE52B111A35A8BDCCF214C647F; "
        "Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545961137,1545961142,1545961149; "
        "index_location_city=%E5%8C%97%E4%BA%AC; "
        "TG-TRACK-CODE=index_search; "
        "SEARCH_ID=832387387eb944a39636c9973cbd41c4; "
        "LGRID=20181228093800-3605ba8a-0a41-11e9-ad84-5254005c3644; "
        "Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545961158"
    )
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Content-Length": "26",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.lagou.com",
        "Cookie": cookie,
        "Origin": "https://www.lagou.com",
        "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest"
    }
    proxies = {
        "http": "****",
        "https": "****",
    }
    attempt = 0
    while max_retries is None or attempt < max_retries:
        attempt += 1
        try:
            response = requests.post(url, data=form, headers=headers,
                                     proxies=proxies, timeout=timeout)
            response.encoding = "utf-8"
            if response.status_code == 200:
                payload = json.loads(response.text)
                # jsonpath returns False (not an empty list) when nothing matches.
                matches = jsonpath.jsonpath(payload, "$.content.positionResult.result")
                if not matches:
                    raise ValueError("no position results in response")
                with MongodbTools("dataanalysis") as mongo:
                    lagou = mongo.db["lagou"]
                    for row in matches[0]:
                        # positionId doubles as the Mongo _id so that re-running
                        # the crawl updates documents instead of duplicating them.
                        row["_id"] = "{}".format(row["positionId"])
                        lagou.update_one({"_id": row["_id"]}, {"$set": row}, upsert=True)
                        print("update or insert data = {}".format(row["_id"]))
                return
            print("unexpected HTTP status {}".format(response.status_code))
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still stop the loop.
            print(e)
        # Pause before retrying instead of hammering the server in a tight loop.
        time.sleep(retry_delay)
直接儲存資料到mongodb中。
2,資料分析
1)資料清洗,格式化
import re
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pymongo import MongoClient

# Load only the fields used by the analysis from the "lagou" collection.
mongo = MongoClient()["dataanalysis"]["lagou"]
values = mongo.find({}, {"_id": 0, "positionAdvantage": 1, "salary": 1, "city": 1,
                         "positionName": 1, "workYear": 1, "education": 1,
                         "industryField": 1, "companySize": 1, "financeStage": 1,
                         "firstType": 1, "secondType": 1, "thirdType": 1})
values = list(values)
df = pd.DataFrame(values)


def _first_text(data):
    """Return the first cell of a one-column row as a non-empty str, else None.

    Missing cells (None, NaN, "") and empty rows all yield None, so callers do
    not call string methods on a float NaN or test an ndarray for truthiness.
    """
    cells = data.values
    if len(cells) == 0:
        return None
    cell = cells[0]
    if not isinstance(cell, str) or not cell:
        return None
    return cell


# Normalise company size
def length(data, type):
    """Parse a company-size label into a head-count bound.

    :param data: one-column row holding the ``companySize`` text
    :param type: 1 for the lower bound, anything else for the upper bound
        (the name shadows the builtin but is kept for caller compatibility)
    :return: int; 0 when the cell is missing
    """
    value = _first_text(data)
    if value is None:
        return 0
    if "以上" in value:
        # Open-ended "...以上" (or more) bucket: fixed 2000 / 10000 bounds.
        return 2000 if type == 1 else 10000
    if "-" in value:
        bounds = value.replace("人", "").split("-")
        return int(bounds[0]) if type == 1 else int(bounds[1])
    # Any other label falls back to the smallest bucket, 0-15.
    return 0 if type == 1 else 15


def min_staff(data):
    """Lower head-count bound of a ``companySize`` row."""
    return length(data, 1)


def max_staff(data):
    """Upper head-count bound of a ``companySize`` row."""
    return length(data, 2)


df["min_staff"] = df[["companySize"]].apply(min_staff, axis=1)
df["max_staff"] = df[["companySize"]].apply(max_staff, axis=1)
df = df.drop(["companySize"], axis=1)


# Normalise salary
def salary(data, type):
    """Parse a salary range such as "10k-20k" into a number.

    :param data: one-column row holding the ``salary`` text
    :param type: 1 -> lower bound, 2 -> upper bound, otherwise the midpoint
    :return: the "k" figures multiplied by 1000; 0 when the cell is missing or
        is not a "low-high" range
    """
    value = _first_text(data)
    if value is None or "-" not in value:
        return 0
    bounds = value.replace("k", "").replace("K", "").split("-")
    if type == 1:
        return int(bounds[0]) * 1000
    if type == 2:
        return int(bounds[1]) * 1000
    return (int(bounds[0]) * 1000 + int(bounds[1]) * 1000) / 2


def min_salary(data):
    """Lower bound of a ``salary`` row."""
    return salary(data, 1)


def max_salary(data):
    """Upper bound of a ``salary`` row."""
    return salary(data, 2)


def avg_salary(data):
    """Midpoint of a ``salary`` row."""
    return salary(data, 3)


df["min_salary"] = df[["salary"]].apply(min_salary, axis=1)
df["max_salary"] = df[["salary"]].apply(max_salary, axis=1)
df["avg_salary"] = df[["salary"]].apply(avg_salary, axis=1)

# Checked in order against the upper-cased position name. The bare-"C" rule is
# last and only matches a standalone C: the original substring test
# find("C") tagged every title containing the letter C (JAVASCRIPT, ORACLE,
# C#, ...) as c/c++ before the Java/PHP checks could run.
_LANGUAGE_PATTERNS = (
    ("python", re.compile(r"PYTHON")),
    ("c/c++", re.compile(r"C\+\+")),
    ("java", re.compile(r"JAVA(?!SCRIPT)")),
    ("php", re.compile(r"PHP")),
    ("c/c++", re.compile(r"(?<![A-Z0-9])(?<!OBJECTIVE-)C(?![A-Z0-9#])")),
)


# Normalise language
def language(data):
    """Classify a position name as "python", "c/c++", "java" or "php".

    :param data: one-column row holding the ``positionName`` text
    :return: the language label, or None when no language is recognised (such
        rows are removed by the ``dropna`` below)
    """
    value = _first_text(data)
    if value is None:
        return None
    value = value.upper()
    for name, pattern in _LANGUAGE_PATTERNS:
        if pattern.search(value):
            return name
    return None


df["language"] = df[["positionName"]].apply(language, axis=1)
# Drops every row that has a missing value in any remaining column.
df = df.dropna()
將薪資、語言、公司規模格式化,並刪除含有 NaN 的資料。
2)每個城市地區的平均工資圖
# One bar chart of mean salary per language for every city. The per-city
# series are also collected in total_x / total_y / total_city so that a later
# chart can compare the cities side by side.
total_x = None
total_y, total_city = [], []
for city, city_rows in df.groupby("city"):
    mean_salary = city_rows.groupby(["language"])["avg_salary"].mean().sort_index()
    plt.figure(figsize=(20, 8), dpi=80)
    plt.bar(mean_salary.index, mean_salary.values)
    # total_x is overwritten on each pass, so it ends up holding the language
    # index of the last city.
    total_x = mean_salary.index
    total_y.append(mean_salary.values)
    total_city.append(city)
    plt.xlabel("語言")
    plt.ylabel("平均薪資")
    plt.title("{}地區程式語言平均薪資".format(city))
    plt.grid()
plt
3)平均薪資城市之間的對比
# Grouped bar chart: one group per language, one bar per city.
plt.figure(figsize=(20, 8), dpi=80)
interval = 6  # distance between the centres of neighbouring language groups
ind = np.arange(len(total_x)) * interval
width = 1  # width of a single bar
city_count = len(total_city)
for index, (city, salaries) in enumerate(zip(total_city, total_y)):
    # Centre the bars of a group on its tick. The original hard-coded
    # "(2 - index)", which is only centred when there are exactly four cities;
    # for four cities this gives the same -1.5, -0.5, 0.5, 1.5 offsets.
    offset = (index - city_count / 2) * width + width / 2
    plt.bar(ind + offset, salaries, label=city, width=width)
plt.xticks(ind, total_x)
plt.xlabel("語言")
plt.ylabel("平均薪資")
plt.title("一線城市程式語言平均薪資")
plt.grid()
plt.legend()
plt
可見大帝都的平均工資最高(底層碼農啊,大哭。。。。。。)
4)崗位優勢的分析
import re
def position_advantage(data):
    """Split a row's ``positionAdvantage`` text into individual perk labels.

    "." and "~" are removed first, then the text is split on commas,
    semicolons, spaces, "、", "+" and "-". Empty fragments are not removed;
    the caller filters them.

    :param data: one-column row (anything with ``.values``) holding the text
    :return: list of str; ``[]`` when the cell is missing, NaN or empty
    """
    cells = data.values
    if len(cells) == 0:
        return []
    value = cells[0]
    # NaN is a float: guard on the type so re.sub never sees a non-string.
    if not isinstance(value, str) or not value:
        return []
    value = re.sub(r"[.~]", "", value)
    # NOTE(review): the original class listed "," and ";" twice, presumably
    # because the full-width forms were normalised to ASCII when the article
    # was converted; both forms are accepted here - confirm against the data.
    return re.split(r"[,,;; 、+-]", value)
# Every distinct perk label found in the data. Sorted so the column order
# (and therefore tie-breaking in the ranking below) is the same on every run.
labels = sorted({
    label
    for row in df[["positionAdvantage"]].apply(position_advantage, axis=1).values
    for label in row
    if label
})
# One 0/1 indicator column per label: 1 when the label occurs in the row's
# positionAdvantage text. regex=False matches the label literally; the
# original passed it as a regular expression, so labels containing characters
# such as "(", "*" or "?" raised or matched the wrong rows. Assigning whole
# columns also avoids the chained assignment position_data[label][mask] = 1.
advantage_text = df["positionAdvantage"]
position_data = pd.DataFrame(
    {
        label: advantage_text.str.contains(label, regex=False, na=False).astype(int)
        for label in labels
    },
    index=df.index,
    columns=labels,
)
result = position_data.sum().sort_values(ascending=False)
# Pie chart of the ten most frequent perks, with everything else merged into "其它".
top = result.iloc[:10]
size = list(top.values)
labels = list(top.index)
size.append(result.sum() - sum(size))
labels.append("其它")
explode = [0] * len(size)
explode[0] = 0.1  # pull the largest slice out of the pie
plt.figure(figsize=(10, 10), dpi=80)
plt.pie(size, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
plt.title("崗位優勢百分比")
plt
5)城市之間的崗位優勢對比
# For each city: the share of the ten most frequent perks among all perk
# mentions in that city, scaled by 10000.
total_value = []
total_label = []
labels = list(result.iloc[:10].index)
for city in total_city:
    data = position_data[df["city"] == city]
    total_size = data.sum().sum()  # all perk mentions in this city
    total_label.append(city)
    total_value.append((data[labels].sum() / total_size * 10000).values.tolist())
# Grouped bar chart: one group per perk, one bar per city.
plt.figure(figsize=(20, 8), dpi=80)
interval = 8  # distance between the centres of neighbouring perk groups
ind = np.arange(len(labels)) * interval
width = 1  # width of a single bar
city_count = len(total_label)
for index, (city, shares) in enumerate(zip(total_label, total_value)):
    # Centre each group on its tick. The original hard-coded "(2 - index)",
    # which is only centred for exactly four cities; for four cities the
    # offsets are unchanged.
    offset = (index - city_count / 2) * width + width / 2
    plt.bar(ind + offset, shares, label=city, width=width)
plt.xticks(ind, labels)
plt.xlabel("福利")
plt.ylabel("佔比(*100)")
plt.title("崗位優勢佔比圖")
plt.grid()
plt.legend()
plt
6)工作經驗要求佔比分析
# Share of each work-experience requirement, one pie chart per city
for city_name, data in df.groupby(by="city"):
    # Number of postings per workYear bucket, smallest first.
    result = data.groupby(by=["workYear"])["avg_salary"].count().sort_values()
    plt.figure(figsize=(8, 8), dpi=80)
    _x = result.index
    _y = result.values
    plt.pie(_y, labels=_x, autopct='%1.1f%%', shadow=True, startangle=90)
    # The title used to say 學歷 (education), copied from the education chart,
    # although this chart groups by workYear.
    plt.title("{}地區程式語言工作經驗要求佔比".format(city_name))
    plt.grid()
plt
7)學歷要求佔比分析
# Share of each education requirement, one pie chart per city
for city, city_rows in df.groupby("city"):
    # Number of postings per education level, ordered by the level's label.
    counts = city_rows.groupby(["education"])["avg_salary"].count().sort_index()
    plt.figure(figsize=(8, 8), dpi=80)
    plt.pie(
        counts.values,
        labels=counts.index,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
    )
    plt.title("{}地區程式語言學歷要求佔比".format(city))
    plt.grid()
plt
8)繪製崗位優勢的詞雲圖
# Build the word cloud of job perks
try:
    # scipy.misc.imread no longer exists in current SciPy releases, and the
    # unguarded import aborted the whole snippet. It is only needed to load an
    # image mask, which this snippet does not do.
    from scipy.misc import imread
except ImportError:
    imread = None
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
import matplotlib.pyplot as plt
from os import path

cloud = WordCloud(
    # A font must be given or the Chinese text renders as garbage; the file
    # name must not contain Chinese characters.
    font_path="C:/simfang.ttf",
    #font_path=path.join(d,'simsun.ttc'),
    # Background colour (black here); any colour can be used.
    background_color='black',
    # Shape of the cloud
    #mask=color_mask,
    # Maximum number of words
    max_words=400,
    # Largest font size; when not given the image height is used
    max_font_size=100,
    # Canvas width and height; ignored when a mask is set
    width=1200,
    height=800,
    margin=2,
    # Fraction of words laid out horizontally (library default 0.9, i.e. 0.1 vertical)
    prefer_horizontal=0.8
)
# Frequency of every perk label, most frequent first.
result = position_data.sum().sort_values(ascending=False)
_labels = list(result.index)
_frequency = list(result.values)
_data = dict(zip(_labels, _frequency))
wc = cloud.generate_from_frequencies(_data)
wc.to_file("cloud.jpg")  # save the image
# Show the word cloud
plt.imshow(wc)
# Hide the axes
plt.axis('off')
plt