基於決策樹預測隱形眼鏡型別
阿新 • • 發佈:2018-12-20
隱形眼鏡資料集是著名的資料集,它包含很多患者眼部狀況的觀察條件以及醫生推薦的隱形眼鏡型別。隱形眼鏡的型別包括硬材質(hard)、軟材質(soft)以及不適合佩戴隱形眼鏡(no lenses)。資料集如下圖所示,第一列代表年齡‘age’,第二列代表眼鏡處方‘prescript’(近視 myope 或遠視 hyper),第三列代表是否散光‘astigmatic’,第四列代表淚液分泌率‘tearRate’(reduced 或 normal),最後一列是醫生推薦的隱形眼鏡型別,也就是類別標籤。
1.匯入資料集,將資料集轉換到列表中
# Load the tab-separated data set into a list of rows. Each row holds the
# four feature values followed by the class label in the last position.
# The context manager closes the file as soon as the rows have been read.
with open('lenses.txt') as fr:
    lenses = [line.strip().split('\t') for line in fr]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lenses
# Output:
# [['young', 'myope', 'no', 'reduced', 'no lenses'],
#  ['young', 'myope', 'no', 'normal', 'soft'],
#  ['young', 'myope', 'yes', 'reduced', 'no lenses'],
#  ['young', 'myope', 'yes', 'normal', 'hard'],
#  ['young', 'hyper', 'no', 'reduced', 'no lenses'],
#  ['young', 'hyper', 'no', 'normal', 'soft'],
#  ['young', 'hyper', 'yes', 'reduced', 'no lenses'],
#  ['young', 'hyper', 'yes', 'normal', 'hard'],
#  ['pre', 'myope', 'no', 'reduced', 'no lenses'],
#  ['pre', 'myope', 'no', 'normal', 'soft'],
#  ['pre', 'myope', 'yes', 'reduced', 'no lenses'],
#  ['pre', 'myope', 'yes', 'normal', 'hard'],
#  ['pre', 'hyper', 'no', 'reduced', 'no lenses'],
#  ['pre', 'hyper', 'no', 'normal', 'soft'],
#  ['pre', 'hyper', 'yes', 'reduced', 'no lenses'],
#  ['pre', 'hyper', 'yes', 'normal', 'no lenses'],
#  ['presbyopic', 'myope', 'no', 'reduced', 'no lenses'],
#  ['presbyopic', 'myope', 'no', 'normal', 'no lenses'],
#  ['presbyopic', 'myope', 'yes', 'reduced', 'no lenses'],
#  ['presbyopic', 'myope', 'yes', 'normal', 'hard'],
#  ['presbyopic', 'hyper', 'no', 'reduced', 'no lenses'],
#  ['presbyopic', 'hyper', 'no', 'normal', 'soft'],
#  ['presbyopic', 'hyper', 'yes', 'reduced', 'no lenses'],
#  ['presbyopic', 'hyper', 'yes', 'normal', 'no lenses']]
2.計算原始資料夏農熵
# Compute the Shannon entropy of the raw data set
import numpy as np
import math
from math import log
from collections import Counter


def shannonEntropy(dataSet):
    """Return the Shannon entropy (in bits) of the class labels in dataSet.

    Args:
        dataSet: sequence of rows; the last element of every row is taken
            as the class label.

    Returns:
        The entropy as a float. An empty data set yields 0.0.

    Example:
        For the lenses data loaded earlier in this article,
        shannonEntropy(lenses) gives 1.3260875253642983.
    """
    num = len(dataSet)
    # The last column is the class label.
    classCount = Counter(row[-1] for row in dataSet)
    entropy = 0.0
    for count in classCount.values():
        prob = float(count) / num
        # Shannon entropy: H = -sum(p * log2(p))
        entropy -= prob * log(prob, 2)
    return entropy
3.劃分資料集
# Split the data set
def splitDataSet(dataSet, feature_index, feature_value):
    """Return the rows whose feature equals feature_value, minus that column.

    Args:
        dataSet: sequence of rows (lists).
        feature_index: position of the feature to test and drop.
        feature_value: value the feature must equal for a row to be kept.

    Returns:
        A new list of new rows; dataSet and its rows are left untouched.
    """
    # Build each reduced row by slicing rather than deleting in place, so
    # the original data set is never modified.
    return [
        row[:feature_index] + row[feature_index + 1:]
        for row in dataSet
        if row[feature_index] == feature_value
    ]
4.選擇根節點
# Select the root node
def selectRootNode(dataSet):
    """Return the index of the feature with the largest information gain.

    Args:
        dataSet: non-empty sequence of rows; every column except the last
            is a feature and the last column is the class label.

    Returns:
        The column index of the best feature. Index 0 is returned when no
        feature yields a strictly positive gain.
    """
    baseEntropy = shannonEntropy(dataSet)  # entropy before any split
    numFeatures = len(dataSet[0]) - 1  # number of features
    total = float(len(dataSet))
    maxInfoGain = 0.0
    bestFeature = 0
    for i in range(numFeatures):
        # dict.fromkeys keeps the distinct values in first-seen order, so
        # the floating-point sum below is accumulated in a fixed order.
        uniqVals = dict.fromkeys(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / total
            newEntropy += prob * shannonEntropy(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain
        if infoGain > maxInfoGain:
            maxInfoGain = infoGain
            bestFeature = i
    return bestFeature
5.構建樹結構
# Select the root node
def selectRootNode(dataSet):
    """Return the index of the feature with the largest information gain.

    Args:
        dataSet: non-empty sequence of rows; every column except the last
            is a feature and the last column is the class label.

    Returns:
        The column index of the best feature. Index 0 is returned when no
        feature yields a strictly positive gain.
    """
    baseEntropy = shannonEntropy(dataSet)  # entropy before any split
    numFeatures = len(dataSet[0]) - 1  # number of features
    total = float(len(dataSet))
    maxInfoGain = 0.0
    bestFeature = 0
    for i in range(numFeatures):
        # dict.fromkeys keeps the distinct values in first-seen order, so
        # the floating-point sum below is accumulated in a fixed order.
        uniqVals = dict.fromkeys(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / total
            newEntropy += prob * shannonEntropy(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain
        if infoGain > maxInfoGain:
            maxInfoGain = infoGain
            bestFeature = i
    return bestFeature


def _majorityClass(classList):
    """Return the most frequent label; ties go to the label seen first."""
    return max(dict.fromkeys(classList), key=classList.count)


# Build the tree structure
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: non-empty sequence of rows; the last column is the class
            label and the other columns are features.
        labels: feature names, one per feature column of dataSet. The list
            is not modified.

    Returns:
        A class label when the node is a leaf, otherwise a dictionary of
        the form {feature_name: {feature_value: subtree_or_label}}.
    """
    classList = [example[-1] for example in dataSet]
    # Every row has the same class: this node is a pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column is left: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return _majorityClass(classList)
    bestFeat = selectRootNode(dataSet)
    bestFeatLabel = labels[bestFeat]
    # splitDataSet drops the chosen column, so drop its name as well. A new
    # list is built so the caller's labels stay intact.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    branches = {}
    for value in dict.fromkeys(example[bestFeat] for example in dataSet):
        subDataSet = splitDataSet(dataSet, bestFeat, value)
        branches[value] = createTree(subDataSet, subLabels)
    return {bestFeatLabel: branches}
# Feature names, listed in the column order of the data set.
lensesLabels = [
    'age',
    'prescript',
    'astigmatic',
    'tearRate',
]
# Build the decision tree from the loaded rows and display it.
myTree = createTree(lenses, lensesLabels)
myTree
執行結果:{'tearRate': {'normal': {'astigmatic': {'no': {'age': {'young': 'soft',
'pre': 'soft',
'presbyopic': {'prescript': {'hyper': 'soft', 'myope': 'no lenses'}}}},
'yes': {'prescript': {'hyper': {'age': {'young': 'hard',
'pre': 'no lenses',
'presbyopic': 'no lenses'}},
'myope': 'hard'}}}},
'reduced': 'no lenses'}}
6.使用樹結構執行分類
def classifier(myTree, featLabels, testVec):
    """Classify testVec by walking the nested-dictionary decision tree.

    Args:
        myTree: tree of the form
            {feature_name: {feature_value: subtree_or_label}}.
        featLabels: feature names; the position of a name is the position
            of that feature's value in testVec.
        testVec: feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that testVec reaches.

    Raises:
        ValueError: if a feature name of the tree is missing from
            featLabels, or if testVec holds a value for which the tree has
            no branch.
    """
    firstFeat = next(iter(myTree))  # each tree node has a single feature key
    secondDict = myTree[firstFeat]
    featIndex = featLabels.index(firstFeat)
    featValue = testVec[featIndex]
    if featValue not in secondDict:
        # Fail with a clear message instead of an unbound local variable.
        raise ValueError(
            'no branch for value %r of feature %r' % (featValue, firstFeat)
        )
    branch = secondDict[featValue]
    if isinstance(branch, dict):
        # Internal node: keep descending.
        return classifier(branch, featLabels, testVec)
    return branch
# Classify one sample: a young, myopic, astigmatic patient with a normal
# tear rate.
classifier(
    myTree,
    ['age', 'prescript', 'astigmatic', 'tearRate'],
    ['young', 'myope', 'yes', 'normal'],
)
執行結果:'hard'
7.畫樹形圖,這裡用Graphviz和pydotplus畫,資料集需要為數字
# Represent every attribute value by a number:
#   age:        'young' = 0, 'pre' = 1, 'presbyopic' = 2
#   prescript:  'myope' = 0, 'hyper' = 1
#   astigmatic: 'no' = 0, 'yes' = 1
#   tearRate:   'reduced' = 0, 'normal' = 1
# Any value not named explicitly below falls back to the last code of its
# column.
a = np.array([{'young': 0, 'pre': 1}.get(line[0], 2) for line in lenses])
b = np.array([int(line[1] != 'myope') for line in lenses])
c = np.array([int(line[2] != 'no') for line in lenses])
d = np.array([int(line[3] != 'reduced') for line in lenses])
# Stack the four columns and transpose so that each row is one sample.
e = [a, b, c, d]
data = np.array(e).T
data
執行結果:
array([[0, 0, 0, 0],
[0, 0, 0, 1],
[0, 0, 1, 0],
[0, 0, 1, 1],
[0, 1, 0, 0],
[0, 1, 0, 1],
[0, 1, 1, 0],
[0, 1, 1, 1],
[1, 0, 0, 0],
[1, 0, 0, 1],
[1, 0, 1, 0],
[1, 0, 1, 1],
[1, 1, 0, 0],
[1, 1, 0, 1],
[1, 1, 1, 0],
[1, 1, 1, 1],
[2, 0, 0, 0],
[2, 0, 0, 1],
[2, 0, 1, 0],
[2, 0, 1, 1],
[2, 1, 0, 0],
[2, 1, 0, 1],
[2, 1, 1, 0],
[2, 1, 1, 1]])
# Draw the tree diagram
from sklearn import tree
import pydotplus

clf = tree.DecisionTreeClassifier()
# The class label is the last element of every row.
target = np.array([line[-1] for line in lenses])
clf = clf.fit(data, target)
# Pass the feature and class names so the nodes of the rendered tree show
# readable names instead of X[i] column indices.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=['age', 'prescript', 'astigmatic', 'tearRate'],
    class_names=clf.classes_.tolist(),
)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("lenses.pdf")