ML Basics --- Applying a Decision Tree
阿新 • Published: 2019-01-28
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import tree
from sklearn import preprocessing

# Read in the csv file, put the features into a list of dicts and the class labels into a list
allElectronicsData = open(r'/home/zhoumiao/MachineLearning/01decisiontree/AllElectronics.csv', 'r')
reader = csv.reader(allElectronicsData)
headers = next(reader)
print(headers)

# featureList holds the feature attributes, which get converted into multi-dimensional vectors:
# e.g. a "height" attribute with values "tall"/"short" becomes {1.0, 0.0};
# if there is no ordinal relation and the attribute has k possible values,
# it is usually converted into a k-dimensional vector,
# e.g. a "melon" attribute with values "watermelon"/"pumpkin"/"cucumber"
# becomes (0,0,1), (0,1,0), (1,0,0).
featureList = []
# labelList holds the class labels
labelList = []

for row in reader:
    # the label is the last column of each row
    labelList.append(row[len(row) - 1])
    # build a dict mapping attribute name -> value for this row
    rowDict = {}
    for i in range(1, len(row) - 1):
        rowDict[headers[i]] = row[i]
    # append the dict to the feature list
    featureList.append(rowDict)

print(featureList)

# Vectorize features: DictVectorizer converts the dicts into the one-hot form described above
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()
print("dummyX: " + str(dummyX))
print(vec.get_feature_names())  # in scikit-learn >= 1.0 this method is get_feature_names_out()
print("labelList: " + str(labelList))

# Vectorize class labels
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print("dummyY: " + str(dummyY))

# Using a decision tree for classification
# clf = tree.DecisionTreeClassifier()
# use information gain (entropy) as the split criterion
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print("clf: " + str(clf))

# Visualize the model: export the tree to a .dot file that Graphviz can render
with open("allElectronicInformationGainOri.dot", 'w') as f:
    f = tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)

# Take the first row of the encoded feature matrix
oneRowX = dummyX[0, :]
print("oneRowX: " + str(oneRowX))

# Modify a copy of the first row, then predict its class
newRowX = oneRowX.copy()
newRowX[0] = 1
newRowX[2] = 0
print("newRowX: " + str(newRowX))

# predict expects a 2-D array, so reshape the single sample
predictedY = clf.predict(newRowX.reshape(1, -1))
print("predictedY: " + str(predictedY))
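
The k-dimensional encoding mentioned in the comments can be seen directly on a toy example. This is a minimal sketch, not part of the original script; the demo_vec name and the toy melon dicts are only for illustration:

# Minimal illustration of DictVectorizer's one-hot encoding:
# each (attribute, value) pair becomes its own 0/1 column.
from sklearn.feature_extraction import DictVectorizer

demo_vec = DictVectorizer(sparse=False)
demo_X = demo_vec.fit_transform([{'melon': 'watermelon'},
                                 {'melon': 'pumpkin'},
                                 {'melon': 'cucumber'}])
print(demo_vec.get_feature_names())  # ['melon=cucumber', 'melon=pumpkin', 'melon=watermelon']
print(demo_X)                        # rows are the 3-dimensional vectors (0,0,1), (0,1,0), (1,0,0)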
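
Note that predictedY is still in the binarized label encoding. As a small follow-up sketch (assuming the lb and predictedY objects produced by the script above), it can be mapped back to the original class string:

# Map the binarized prediction back to the original class label
# (uses the lb and predictedY objects created by the script above)
print("predicted label: " + str(lb.inverse_transform(predictedY)))

The exported allElectronicInformationGainOri.dot file can then be rendered to an image with the Graphviz command-line tool, e.g. dot -Tpng allElectronicInformationGainOri.dot -o allElectronicInformationGainOri.png.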