決策樹入門程序,各部分配有解析
阿新 • • 發佈:2018-09-11
ade rom 說明 處理方式 sda blog six ring rap
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import tree
from sklearn import preprocessing
from sklearn.externals.six import StringIO
# Read the CSV file and split it into a list of feature dicts (one per row)
# and a parallel list of class labels.
# The file is opened in a with-block so the handle is always closed, even if
# parsing fails part-way through.
with open(r'AllElectronics.csv', 'rt') as allElectronicsData:
    reader = csv.reader(allElectronicsData)
    # The first row holds the column names.
    headers = next(reader)
    print(headers)
    featureList = []
    labelList = []
    for row in reader:
        # Skip blank lines (e.g. a trailing newline) instead of crashing on
        # an empty row.
        if not row:
            continue
        # The last column is the class label.
        labelList.append(row[-1])
        # Store the features as key/value pairs: keys come from the header
        # row, values from the current data row. The first column (index 0)
        # and the last column (the label) are excluded.
        rowDict = {headers[i]: row[i] for i in range(1, len(row) - 1)}
        featureList.append(rowDict)
print(featureList)
# Vectorize the features (0/1 encoding).
# DictVectorizer works on symbolic (non-numeric) but structured feature data
# such as dicts and turns the symbols into 0/1 numbers. For a non-numeric
# value it combines the original feature name with the value to form a new
# feature and encodes that feature as 0 or 1; numeric features are generally
# kept as they are.
vec = DictVectorizer()
# fit_transform() learns the feature mapping from the data and then applies
# it; transform() only applies a mapping that has already been learned.
encodedFeatures = vec.fit_transform(featureList)
# Convert the vectorizer output into a dense array for printing and training.
dummyX = encodedFeatures.toarray()
print("dummyX: {}".format(dummyX))
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out() -- confirm the
# scikit-learn version this script targets.
print(vec.get_feature_names())
print("labelList: {}".format(labelList))
# Vectorize the class labels.
# Label binarization:
#   sklearn.preprocessing.LabelBinarizer(neg_label=0, pos_label=1,
#                                        sparse_output=False)
# converts multi-class labels into binary labels and returns a binary array
# or a sparse matrix.
# Parameters:
#   neg_label:     value used to encode the negative label
#   pos_label:     value used to encode the positive label
#   sparse_output: when True the result is returned as a CSR sparse matrix,
#                  otherwise as an array
# Attribute classes_: array holding the distinct class label values.
# Example: with neg_label=2 and pos_label=4 the result is still a two-valued
# array, which illustrates what the neg_label / pos_label values mean.
lb = preprocessing.LabelBinarizer().fit(labelList)
dummyY = lb.transform(labelList)
print("dummyY: {}".format(dummyY))
# Use a decision tree for classification.
# criterion='entropy' selects splits by information gain; calling
# tree.DecisionTreeClassifier() without arguments would use the library's
# default criterion instead.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print("clf: " + str(clf))
# Visualize the model: write the fitted tree in Graphviz DOT format.
# export_graphviz writes directly into the open file, so its return value is
# not assigned (the original rebound `f` to it while the file was still open).
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out() -- confirm the
# scikit-learn version this script targets.
with open("allElectronicInformationGainOri.dot", 'w') as f:
    tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)
# Take the first training sample as the starting point for a new sample.
oneRowX = dummyX[0, :]
print("oneRowX: " + str(oneRowX))
# Copy the row before editing it: dummyX[0, :] is a view into dummyX, so
# modifying it in place would silently change both the training matrix and
# oneRowX.
newRowX = oneRowX.copy()
# Flip two of the encoded feature columns to build a different sample.
newRowX[0] = 1
newRowX[2] = 0
print("newRowX: " + str(newRowX))
# predict() expects a 2-D array, so reshape the single sample to (1, n_features).
predictedY = clf.predict(newRowX.reshape(1, -1))
print("predictedY: " + str(predictedY))
數據樣本:
決策樹入門程序,各部分配有解析