專案:利用adaboost對Employee滿意度進行分類
阿新 • • 發佈:2020-12-16
專案介紹:利用adaboost對Employee滿意度進行分類
0.整理資料
從https://www.datafountain.cn/datasets/12下載IBM員工滿意度的虛擬資料,並做好員工滿意度型別標籤,並整理成txt檔案。
1.分析目的
對員工滿意度進行預測分類。
2. 分析程式碼
從實際分類效果來看,adaboost在測試集(735筆資料)上分錯150筆,錯誤率約為20.4%,正確率約為79.6%,測試效果還不錯。
import pandas as pd
import numpy as np
%matplotlib inline
%matplotlib notebook
import matplotlib.pyplot as plt
from numpy import *
import adaboost
# Load the training data (tab-separated features, label in the last column).
datArr,labelArr = adaboost.loadDataSet('HR_Employee_traindata2.txt')
# Train the boosted classifier with 10 rounds of decision stumps.
classifierArray,_ = adaboost.adaBoostTrainDS(datArr,labelArr,10)
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
# Display the trained stumps; the list literal below is the notebook output
# of this expression, kept verbatim.
classifierArray
[{'dim': 0, 'thresh': 13.8, 'ineq': 'lt', 'alpha': 0.727883366967329 },
{'dim': 24, 'thresh': 2.4, 'ineq': 'gt', 'alpha': 0.12463342761307217},
{'dim': 10, 'thresh': 2.2, 'ineq': 'gt', 'alpha': 0.1003746703517236},
{'dim': 20, 'thresh': 3.1, 'ineq': 'lt', 'alpha': 0.0963189756522224},
{'dim': 8, 'thresh': 79.0, 'ineq': 'gt', 'alpha': 0.0949072890789229},
{'dim': 4, 'thresh': 2.2, 'ineq': 'gt', 'alpha': 0.08407359046211836},
{'dim': 3, 'thresh': 1.0, 'ineq': 'lt', 'alpha': 0.0960231041915813},
{'dim': 26, 'thresh': 6.8, 'ineq': 'lt', 'alpha': 0.0948392081206876},
{'dim': 4, 'thresh': 3.0, 'ineq': 'lt', 'alpha': 0.06227497331058203},
{'dim': 18, 'thresh': 20.799999999999997, 'ineq': 'gt', 'alpha': 0.07452043991609442}]
# Load the test data.
testArr,testLabelArr = adaboost.loadDataSet('HR_Employee_testdata2.txt')
# Classify the test samples with the trained stumps.
prediction10 = adaboost.adaClassify(testArr,classifierArray)
# Count misclassified samples. The indicator column is sized from the test
# labels rather than a hard-coded 735 so the cell works for any test file;
# asmatrix is used because the `mat` alias was removed in NumPy 2.0.
errArr = asmatrix(ones((len(testLabelArr), 1)))
errnum = errArr[prediction10 != asmatrix(testLabelArr).T].sum()
errnum
#150.0
# Error rate = misclassified samples / total test samples.
errnum/len(errArr)
#0.20408163265306123
3.adaboost原始碼
#coding=utf-8
from numpy import *
def loadSimpData():
    """Return a tiny hand-made data set for trying out the stump/boosting code.

    Returns:
        A tuple ``(datMat, classLabels)`` where ``datMat`` is a 5x2 numpy
        matrix of feature values and ``classLabels`` is a list of five
        labels, each either 1.0 or -1.0.
    """
    datMat = matrix([[1.0, 2.1],
                     [2.0, 1.1],
                     [1.3, 1.0],
                     [1.0, 1.0],
                     [2.0, 1.0]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels
# Classify the data with a single threshold test on one feature.
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Label every sample +1.0 or -1.0 by thresholding one feature column.

    Args:
        dataMatrix: 2-D sample data (numpy matrix, ndarray or nested list),
            one row per sample.
        dimen: index of the feature column to test.
        threshVal: threshold the feature value is compared against.
        threshIneq: ``'lt'`` marks samples with value <= threshVal as -1.0;
            any other value marks samples with value > threshVal as -1.0.

    Returns:
        An ndarray of shape (m, 1) holding 1.0 or -1.0 for each of the m rows.
    """
    # Work on a plain column vector so matrices, arrays and lists all behave
    # the same way.
    column = asarray(dataMatrix)[:, dimen].reshape(-1, 1)
    if threshIneq == 'lt':
        negative = column <= threshVal
    else:
        negative = column > threshVal
    return where(negative, -1.0, 1.0)
# Find the best decision stump for the current sample weights.
def buildStump(dataArr, classLabels, D):
    """Search all features/thresholds for the stump with the lowest weighted error.

    Args:
        dataArr: 2-D sample data, one row per sample.
        classLabels: sequence of class labels, one per sample.
        D: (m, 1) column of sample weights.

    Returns:
        A tuple ``(bestStump, minError, bestClasEst)``: ``bestStump`` is a
        dict with keys ``'dim'``, ``'thresh'`` and ``'ineq'``; ``minError`` is
        the lowest weighted error found (a 1x1 matrix once any stump has been
        evaluated); ``bestClasEst`` is the (m, 1) prediction of that stump.
    """
    # asmatrix is used instead of the `mat` alias, which NumPy 2.0 removed.
    dataMatrix = asmatrix(dataArr)
    labelMat = asmatrix(classLabels).T
    weights = asmatrix(D)
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = asmatrix(zeros((m, 1)))
    minError = inf  # lowest weighted error so far; start at infinity
    for i in range(n):  # every feature
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        # Step size is derived from the value range of this feature.
        stepSize = (rangeMax - rangeMin) / numSteps
        # j runs from -1 so one threshold lies below the feature minimum.
        for j in range(-1, int(numSteps) + 1):
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ('lt', 'gt'):  # try both directions of the test
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 where the prediction is wrong, 0 where it matches the label.
                errArr = asmatrix(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                # Weighted error: weight vector times the error indicator.
                weightedError = weights.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
# Full AdaBoost training loop built on decision stumps.
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: 2-D sample data, one row per sample.
        classLabels: sequence of class labels, one per sample.
        numIt: maximum number of boosting iterations.

    Returns:
        A tuple ``(weakClassArr, aggClassEst)``: ``weakClassArr`` is the list
        of stump dicts (keys ``'dim'``, ``'thresh'``, ``'ineq'``, ``'alpha'``)
        and ``aggClassEst`` is the (m, 1) matrix of accumulated weighted
        class estimates for the training samples.
    """
    weakClassArr = []
    m = shape(dataArr)[0]  # number of samples
    # asmatrix is used instead of the `mat` alias, which NumPy 2.0 removed.
    labelMat = asmatrix(classLabels).T
    D = asmatrix(ones((m, 1)) / m)  # every sample starts with weight 1/m
    aggClassEst = asmatrix(zeros((m, 1)))  # running weighted vote per sample
    for i in range(numIt):
        # Fit one stump against the current weights D.
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print ("D:",D.T)
        # Reduce the weighted error to a plain scalar and clamp the
        # denominator explicitly; this avoids calling max(), whose meaning
        # depends on whether `from numpy import *` shadows the builtin.
        err = float(asarray(error).reshape(-1)[0])
        denom = err if err > 1e-16 else 1e-16  # guards against division by zero
        # alpha is the voting weight of this stump.
        alpha = float(0.5 * log((1.0 - err) / denom))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print ("classEst: ",classEst.T)
        # Re-weight the samples for the next iteration and renormalise.
        expon = multiply(-1 * alpha * labelMat, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        # Accumulate the ensemble vote and measure its training error.
        aggClassEst += alpha * classEst
        print ("aggClassEst: ",aggClassEst.T)
        aggErrors = multiply(sign(aggClassEst) != labelMat, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print ("total error: ",errorRate)
        if errorRate == 0.0:
            break  # stop once the training data is classified perfectly
    return weakClassArr, aggClassEst
# Classify new samples with a trained AdaBoost ensemble.
def adaClassify(datToClass, classifierArr):
    """Classify samples with the weighted vote of the trained stumps.

    Args:
        datToClass: 2-D sample data to classify, one row per sample.
        classifierArr: list of stump dicts with keys ``'dim'``, ``'thresh'``,
            ``'ineq'`` and ``'alpha'``.

    Returns:
        An (m, 1) matrix holding the sign of the accumulated weighted vote
        for each of the m samples.
    """
    # asmatrix is used instead of the `mat` alias, which NumPy 2.0 removed.
    dataMatrix = asmatrix(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = asmatrix(zeros((m, 1)))
    for stump in classifierArr:  # every weak classifier
        # The original read the threshold through the undefined name
        # `classifierAr`, which raised NameError on the first iteration.
        classEst = stumpClassify(dataMatrix,
                                 stump['dim'],
                                 stump['thresh'],
                                 stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
        print (aggClassEst)  # show the running vote after each stump
    return sign(aggClassEst)
# Applying AdaBoost to a harder data set:
# loader that adapts to the number of columns in the file.
def loadDataSet(fileName):
    """Load a tab-separated data file whose last column is the class label.

    The number of columns is taken from the first non-blank line. Blank
    lines are skipped.

    Args:
        fileName: path of the tab-separated text file.

    Returns:
        A tuple ``(dataMat, labelMat)``: ``dataMat`` is a list of rows, each a
        list of floats (all columns but the last); ``labelMat`` is the list of
        float labels taken from the last column of each row.

    Raises:
        ValueError: if a field cannot be converted to float.
        IndexError: if a row has fewer fields than the first row.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # A single `with` block closes the file; the original opened it twice
    # and never closed either handle.
    with open(fileName) as fr:
        for line in fr:
            if not line.strip():
                continue  # a blank line would otherwise fail in float('')
            if numFeat is None:
                numFeat = len(line.split('\t'))  # number of fields per row
            curLine = line.strip().split('\t')
            # Index explicitly so a short row still raises IndexError.
            dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
4.百度硬碟分享
已將資料來源和程式碼打包上傳百度硬碟,若需要資料,請打賞任意金額並留下聯絡郵箱,將給予密碼。
https://pan.baidu.com/s/1gP34nFySkN8QBQm5rqb77Q