1. 程式人生 > 其它 >薪資預測模型

薪資預測模型

一、選題背景:

本題通過對職員薪資資訊讀取和視覺化來展現職員薪資情況,並建立模型對薪資進行預測。

二、資料說明:

本題爬取58同城上職員資訊的部分資料

三、實施過程及程式碼:

#讀取資料
import pandas as pd
from pandas import Series
# Load the four job-posting CSV files, one DataFrame per file.
data_analysis = pd.read_csv('./data_to_be_analysed/data_analysis_with_skills.csv')
data_mining = pd.read_csv('./data_to_be_analysed/data_mining_with_skills.csv')
machine_learning = pd.read_csv('./data_to_be_analysed/machine_learning_with_skills.csv')
business_analysis = pd.read_csv('./data_to_be_analysed/business_analysis_with_skills.csv')
data_analysis.shape

#新增薪資均值

import re
# 均值函式
def average(job_salary):
    """Return the midpoint of the salary range written in *job_salary*.

    The first two integers found in the text are treated as the lower and
    upper bound (``'10-20K'`` -> ``15.0``); any further numbers in the string
    are ignored.  A string containing a single number yields that number.
    Returns ``0`` when *job_salary* is not a string or contains no digits.
    """
    if not isinstance(job_salary, str):
        # Missing values (None/NaN) and other non-text input cannot be parsed.
        return 0
    bounds = [int(token) for token in re.findall(r'\d+', job_salary)[:2]]
    if not bounds:
        return 0
    return sum(bounds) / len(bounds)


# Accumulator used by the per-row salary loops that follow.
salary_list = []
# Add a numeric 'salary' column at position 9 of every dataset, computed from
# the raw '職位薪資' text with average().
for frame in (data_analysis, data_mining, machine_learning, business_analysis):
    frame.insert(9, 'salary', frame['職位薪資'].map(average))
#薪資分佈探索
data_analysis.salary.describe()

ata_analysis.columns
%matplotlib inline
import matplotlib.pyplot as plt
data_analysis.salary.hist(bins=50, figsize=(8,5))

plt.show()
  • 薪資主要分佈在5k-30k之間
# Count the postings above 30 and below 5 before removing them.
above_band = data_analysis.salary > 30
below_band = data_analysis.salary < 5
data_analysis[above_band].shape

data_analysis[below_band].shape

# Keep only rows whose salary is strictly between 5 and 30.
in_band = (data_analysis['salary'] < 30) & (data_analysis['salary'] > 5)
data_analysis = data_analysis[in_band]
data_analysis.head(2)

# The same four columns are removed from every dataset.
unused_columns = ['Unnamed: 0','Keyword','職位描述','職位薪資']
data_analysis = data_analysis.drop(unused_columns, axis=1)
data_mining = data_mining.drop(unused_columns, axis=1)
machine_learning = machine_learning.drop(unused_columns, axis=1)
business_analysis = business_analysis.drop(unused_columns, axis=1)

#掌握的軟體技能對薪資的影響關係
# Correlate over numeric/boolean columns only: since pandas 2.0
# DataFrame.corr() no longer drops text columns silently and raises on them.
corr_matrix = data_analysis.select_dtypes(include=['number', 'bool']).corr()
corr_matrix["salary"].sort_values(ascending=False)

Data Analysis的職位中,Hive,Spark,Hadoop大資料應用方面的軟體是薪資的加分項。同時,Python,SQL,SAS,Tableau,SPSS等統計分析軟體與視覺化軟體也是資料分析師區別於低薪分析專員的因素。PPT,Excel作為必須的軟體技能,對薪資變化並沒有太大的影響,甚至僅僅會Excel的職位淪落為專員,會是一個減分項。結論:在資料分析領域,擁有大資料軟體技能並且懂得Python這一程式語言的分析師的待遇較好。



# Correlate over numeric/boolean columns only: since pandas 2.0
# DataFrame.corr() no longer drops text columns silently and raises on them.
corr_matrix = data_mining.select_dtypes(include=['number', 'bool']).corr()
corr_matrix["salary"].sort_values(ascending=False)


Data Mining的職位中,Hive,Spark,Hadoop大資料方面的軟體是薪資極大的加分項。Java,C,Python等程式語言對資料探勘的工作有很大幫助,因此也體現在了對薪資的正面影響上。分析結論:具備資料探勘演算法與編碼能力且具備大資料方面分析技能的資料探勘工程師的待遇較好。



# Correlate over numeric/boolean columns only: since pandas 2.0
# DataFrame.corr() no longer drops text columns silently and raises on them.
corr_matrix = machine_learning.select_dtypes(include=['number', 'bool']).corr()
corr_matrix["salary"].sort_values(ascending=False)


Machine Learning的職位中,沒有特別突出的技能加分項,列表中的軟體技能基本都是入職必備的技能。Hive,Spark,Hadoop等大資料方面的技能會對薪資有一定程度的提升,不過影響較小。分析結論:機器學習工程師入門難度稍高,需要掌握具備的軟體技能也較多,沒有特別突出的薪資加分項。



# Correlate over numeric/boolean columns only: since pandas 2.0
# DataFrame.corr() no longer drops text columns silently and raises on them.
corr_matrix = business_analysis.select_dtypes(include=['number', 'bool']).corr()
corr_matrix["salary"].sort_values(ascending=False)


Business Analysis的職位中,程式語言是極大的薪資加分項,如C,Python,Java。Excel,PPT,SPSS等軟體是這個職位的必備技能,因此對職位薪資沒有太大的影響。結論:在商業分析領域,擁有商業分析思維並且具有程式設計能力的分析師的待遇較好。

#準備資料
data_analysis.head(2)

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
train_set, test_set = train_test_split(
    data_analysis,
    test_size=0.2,
    random_state=42,
)
# Work on copies so the split results themselves stay untouched.
data_train, data_test = train_set.copy(), test_set.copy()
data_train.shape

data_test.shape

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
data_analysis.head(1)

# Numeric features are whatever remains after removing the text columns and
# the 'salary' target.
data_analysis_num = data_analysis.drop(['公司名稱','公司規模','地區','學歷要求','工作經驗','職位名稱','融資情況','salary'], axis=1)
num_attribs = list(data_analysis_num)
cat_attribs = ['公司規模','學歷要求','工作經驗']

# Standardise the numeric columns.
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        # handle_unknown='ignore': a category that never occurred in the
        # training split (rare value in the test split, or a hand-written
        # prediction input) is encoded as all zeros instead of raising.
        ("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs),
    ])

# Fit on the training split only, then apply the fitted transform to the test split.
data_analysis_prepared = full_pipeline.fit_transform(data_train)
data_analysis_test = full_pipeline.transform(data_test)
data_analysis_prepared[:1]

data_train.head(1)

# Regression targets for both splits.
data_analysis_labels = data_train.salary.values
test_labels = data_test.salary.values
#訓練模型線性迴歸
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the prepared training matrix.
lin_reg = LinearRegression()
lin_reg.fit(X=data_analysis_prepared, y=data_analysis_labels)

from sklearn.metrics import mean_squared_error
import numpy as np

# Root-mean-squared error on the training data itself.
salary_predictions = lin_reg.predict(X=data_analysis_prepared)
lin_mse = mean_squared_error(
    y_true=data_analysis_labels,
    y_pred=salary_predictions,
)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

#測試集



# Predict on the held-out test matrix and compare with the true labels.
y_test = lin_reg.predict(X=data_analysis_test)
y_test[:10]


test_labels[:10]

# Root-mean-squared error on the test split (overwrites the training-set values).
lin_mse = mean_squared_error(y_true=test_labels, y_pred=y_test)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

測試集上誤差約為4.25

#變數重要性
# NOTE(review): grid_search is not defined in the visible part of this file;
# presumably a fitted search over a tree-based model trained on
# data_analysis_prepared — confirm.
feature_importances = grid_search.best_estimator_.feature_importances_
num_attribs = list(data_analysis_num)
cat_attribs = ['公司規模','學歷要求','工作經驗']
# The one-hot encoder expands every categorical attribute into one column per
# category, so each expanded column needs its own label; zipping with the
# three raw names would truncate the list and mislabel the importances.
cat_encoder = full_pipeline.named_transformers_['cat']
cat_columns = [
    '{}_{}'.format(attrib, category)
    for attrib, categories in zip(cat_attribs, cat_encoder.categories_)
    for category in categories
]
# Rank features by importance, highest first.
attributes = num_attribs + cat_columns
sorted(zip(feature_importances, attributes), reverse=True)



公司規模對薪資的影響相比之下比較小。


#薪資預測


# NOTE(review): final_model is not defined in the visible part of this file — confirm
# where it is created.
final_predictions = final_model.predict(data_analysis_test)
# Re-attach the test-set index so the predictions align row by row.
salary_test_series = Series(data=final_predictions, index=data_test.index)
data_test_prediction = data_test.copy()
data_test_prediction.insert(loc=7, column='prediction', value=salary_test_series)
data_test_prediction.sample(3)


#預測函式介面


data_test.head(1)


from pandas import DataFrame
#預測功能函式


def prediction_function(scale,degree,experience,v_skills):
    """Build the transformed feature row for one hypothetical posting.

    Assembles a single-row DataFrame from the company scale, degree
    requirement, work experience and ten skill flags (``v_skills[0]`` to
    ``v_skills[9]``, in the order Sql, Python, Excel, Sas, Spss, Hive,
    Hadoop, Ppt, Tableau, Spark) and returns
    ``full_pipeline.transform`` applied to it.
    """
    skill_columns = (
        'Sql', 'Python', 'Excel', 'Sas', 'Spss',
        'Hive', 'Hadoop', 'Ppt', 'Tableau', 'Spark',
    )
    row = {'公司規模': [scale], '學歷要求': [degree], '工作經驗': [experience]}
    # Index explicitly so a too-short v_skills still raises IndexError.
    for position, column in enumerate(skill_columns):
        row[column] = [v_skills[position]]
    return full_pipeline.transform(pd.DataFrame(row))

#技能轉換函式


def skills_switch(skill_list):
    """Convert a list of skill names into ten 0/1 flags.

    The flags follow the fixed order Sql, Python, Excel, Sas, Spss, Hive,
    Hadoop, Ppt, Tableau, Spark; a flag is 1 when that skill appears in
    *skill_list*.  Matching ignores letter case (``'python'`` and ``'SQL'``
    both count), and non-string entries are skipped.
    """
    skills = ['Sql','Python','Excel','Sas','Spss','Hive','Hadoop','Ppt','Tableau','Spark']
    # Normalise case once so every lookup below is a cheap set membership test.
    mastered = {name.casefold() for name in skill_list if isinstance(name, str)}
    return [1 if skill.casefold() in mastered else 0 for skill in skills]

#預測主函式


def predict(scale,degree,experience,v_skills):
    """Print the salary that ``final_model`` predicts for the described posting."""
    features = prediction_function(scale, degree, experience, v_skills)
    predicted = final_model.predict(features)[0]
    print('預測薪資為:',predicted,'k/month')

#預測函式


# ----------- inputs for a single prediction
scale, degree, experience = '10000人以上', '本科', '1-3年'
# ----------- skills the candidate has mastered (order does not matter)
mastered_skills = ['Sql','Python','Excel','Spss','Ppt']
v_skills = skills_switch(mastered_skills)
predict(scale,degree,experience,v_skills)

# Repeat the prediction for each experience level with the other inputs fixed.
experiences = ['應屆生','1年以內', '1-3年','3-5年', '5-10年' ]
skills_text = ",".join(mastered_skills)
separator = '-'*60

for exp in experiences:
    print(scale,'|',degree,'|',exp,'|',skills_text)
    predict(scale,degree,exp,v_skills)
    print(separator)
#總結

編寫過程還有許多漏洞是自己無法解決處理的,程式碼沒有達到要求,也查不到問題所在。通過這次作業才發現自己原來有這麼多不足,知識點不全面,還有很大的提升空間。只有多寫多看多用才能做到真正的掌握,有不懂的地方要及時解決、虛心求教。