決策樹分類鳶尾花資料demo

阿新 • • 發佈：2019-01-07

code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pydotplus

if __name__ == "__main__":
    # Demo script: fit a decision tree on the four iris features, export the
    # fitted tree through graphviz, draw the decision regions over the first
    # two features, and finally plot the test error against the tree depth.

    # iris_feature_E labels the nodes of the exported tree; iris_feature
    # labels the plot axes; iris_class names the classes in the exported tree.
    iris_feature_E = "sepal lenght", "sepal width", "petal length", "petal width"
    iris_feature = "the length of sepal", "the width of sepal", "the length of petal", "the width of petal"
    iris_class = "Iris-setosa", "Iris-versicolor", "Iris-virginica"

    data = pd.read_csv("iris.data", header=None)
    # Encode the species strings of column 4 as integer codes, numbered in
    # order of first appearance (the same numbering enumerate(unique()) gave).
    # DataFrame.set_value no longer exists in current pandas, and going through
    # data.values would leave object-dtype arrays that sklearn rejects as labels.
    labels, iris_types = pd.factorize(data[4])
    x = data.iloc[:, :4].to_numpy(dtype=float)
    # Keep y as a column vector, the shape np.split produced originally.
    y = labels.reshape(-1, 1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=1)
    print(y_test)

    model = DecisionTreeClassifier(criterion='entropy', max_depth=6)
    model = model.fit(x_train, y_train)
    y_test_hat = model.predict(x_test)

    # Export the fitted tree: plain .dot file first, then a styled PDF and PNG.
    with open('iris.dot', 'w') as f:
        tree.export_graphviz(model, out_file=f)
    dot_data = tree.export_graphviz(
        model, out_file=None, feature_names=iris_feature_E, class_names=iris_class,
        filled=True, rounded=True, special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf('iris.pdf')
    # Context manager so the handle is closed even if create_png() raises.
    with open('iris.png', 'wb') as png_file:
        png_file.write(graph.create_png())

    # Plotting
    # Number of sample points along each axis of the grid.
    N, M = 50, 50
    # Range of column 0.
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    # Range of column 1.
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    # Build the grid of sample points.
    x1, x2 = np.meshgrid(t1, t2)
    # The model was trained on four features, so the two features that are not
    # plotted are filled with their mean value; they carry no meaning here and
    # only make the grid points four-dimensional.
    x3 = np.ones(x1.size) * np.average(x[:, 2])
    x4 = np.ones(x1.size) * np.average(x[:, 3])
    # Grid points to classify, one row per point.
    x_show = np.stack((x1.ravel(), x2.ravel(), x3, x4), axis=1)
    print("x_show_shape:\n", x_show.shape)

    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    # Predicted class for every grid point.
    y_show_hat = model.predict(x_show)
    print(y_show_hat.shape)
    print(y_show_hat)
    # Give the predictions the same shape as the grid.
    y_show_hat = y_show_hat.reshape(x1.shape)
    print(y_show_hat)
    plt.figure(figsize=(15, 15), facecolor='w')
    # Decision regions.
    plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)
    print(y_test)
    print(y_test.ravel())
    # Test samples (stars).
    plt.scatter(x_test[:, 0], x_test[:, 1], c=np.squeeze(y_test), edgecolors='k', s=120, cmap=cm_dark, marker='*')
    # All samples.
    plt.scatter(x[:, 0], x[:, 1], c=np.squeeze(y), edgecolors='k', s=40, cmap=cm_dark)
    plt.xlabel(iris_feature[0], fontsize=15)
    plt.ylabel(iris_feature[1], fontsize=15)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(True)
    plt.title('yuanwei flowers regressiong with DecisionTree', fontsize=17)
    plt.show()

    # Prediction results on the test set.
    y_test = y_test.reshape(-1)
    print(y_test_hat)
    print(y_test)
    # True where the prediction is correct, False where it is wrong.
    result = (y_test_hat == y_test)
    acc = np.mean(result)
    print('accuracy: %.2f%%' % (100 * acc))

    # Overfitting: test error as a function of the maximum tree depth.
    depth = np.arange(1, 15)
    err_list = []
    for d in depth:
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
        clf = clf.fit(x_train, y_train)
        # Score on the test data; a separate name keeps y_test_hat (the
        # depth-6 predictions printed above) intact.
        depth_hat = clf.predict(x_test)
        err = 1 - np.mean(depth_hat == y_test)
        err_list.append(err)
        print(d, 'error ratio: %.2f%%' % (100 * err))
    plt.figure(figsize=(15, 15), facecolor='w')
    plt.plot(depth, err_list, 'ro-', lw=2)
    plt.xlabel('DecisionTree Depth', fontsize=15)
    plt.ylabel('error ratio', fontsize=15)
    plt.title('DecisionTree Depth and Overfit', fontsize=17)
    plt.grid(True)
    plt.show()

生成的圖檔案：

鳶尾花的資料特徵一共有四種：花萼長度、花萼寬度、花瓣長度、花瓣寬度。接下來每次只取其中兩個特徵，對所有兩兩特徵組合分別使用決策樹進行分類：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pydotplus

if __name__ == "__main__":
    # Demo script: for every pair of the four iris features, fit a decision
    # tree on just those two features, report its accuracy on the data it was
    # trained on, and draw its decision regions in one cell of a 2x3 figure.

    iris_feature_E = "sepal lenght", "sepal width", "petal length", "petal width"
    iris_feature = "the length of sepal", "the width of sepal", "the length of petal", "the width of petal"
    iris_class = "Iris-setosa", "Iris-versicolor", "Iris-virginica"

    data = pd.read_csv("iris.data", header=None)
    # Encode the species strings of column 4 as integer codes, numbered in
    # order of first appearance (the same numbering enumerate(unique()) gave).
    # DataFrame.set_value no longer exists in current pandas, and going through
    # data.values would leave object-dtype arrays that sklearn rejects as labels.
    y, iris_types = pd.factorize(data[4])
    # All four feature columns; every sample is used for training here.
    x_all = data.iloc[:, :4].to_numpy(dtype=float)

    feature_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
    # Grid resolution and colormaps do not depend on the pair, so build once.
    N, M = 500, 500
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])

    plt.figure(figsize=(15, 15), facecolor='w')
    for i, pair in enumerate(feature_pairs):
        # Select the two feature columns of this pair.
        x = x_all[:, list(pair)]
        # Train the decision tree.
        clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)
        dt_clf = clf.fit(x, y)

        # Range of column 0 and of column 1.
        x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
        x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
        t1 = np.linspace(x1_min, x1_max, N)
        t2 = np.linspace(x2_min, x2_max, M)
        # Build the grid of sample points, one row per point.
        x1, x2 = np.meshgrid(t1, t2)
        x_test = np.stack((x1.ravel(), x2.ravel()), axis=1)

        # Predict on the training data and count the correct predictions.
        y_hat = dt_clf.predict(x)
        c = np.count_nonzero(y_hat == y)
        print("y_hat:\n", y_hat)
        print("y:\n", y)

        # Report the result for this feature pair.
        print('features:\t', iris_feature[pair[0]], ' + ', iris_feature[pair[1]])
        print('the number of true prediction:', c)
        print('acc:%.2f%%' %(100 * float(c) / float(len(y))))

        # Predicted class for every grid point, reshaped to match the grid.
        y_test_hat = dt_clf.predict(x_test).reshape(x1.shape)
        plt.subplot(2, 3, i+1)
        plt.pcolormesh(x1, x2, y_test_hat, cmap=cm_light)
        plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', cmap=cm_dark)
        plt.xlabel(iris_feature[pair[0]], fontsize=14)
        plt.ylabel(iris_feature[pair[1]], fontsize=14)
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        plt.grid()
    plt.suptitle('the result of yuanwei flowers in each two features with dcisiontree', fontsize=20)
    # pad must be passed by keyword; current Matplotlib rejects it positionally.
    plt.tight_layout(pad=2)
    plt.subplots_adjust(top=0.92)
    plt.show()

顯然第二種特徵組合（花萼長度 + 花瓣長度）的分類效果還不錯。

接著我們使用隨機森林演算法來分類看看效果：

只需要在上面的程式碼中修改：

# Train with a decision tree (this is the line as it appears in the script above)
clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)

為：

# Train with a random forest instead; the class lives in sklearn.ensemble and
# is not among the script's imports, so it has to be imported as well.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=6)

效果：

看得出來隨機森林的分類效果要比單棵決策樹好。隨機森林是由多個決策樹弱分類器聯合成的一個強分類器，所以其分類邊界處呈現很多鋸齒，分類的準確度也提高很多：150 個數據，最後只有一個分錯。

決策樹分類鳶尾花資料demo

code:import numpy as np import pandas as pd import matplotlib.pyplot as plt import matplotlib as mpl from sklearn import tree from sklearn

決策樹分類鳶尾花資料集

import numpy as np import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt from sklearn.tree import DecisionTreeClassifier iris_

[Java][機器學習]用決策樹分類演算法對Iris花資料集進行處理

Iris Data Set是很經典的一個數據集，在很多地方都能看到，一般用於教學分類演算法。這個資料集在UCI Machine Learning Repository裡可以找到（還是下載量排第一的資料喲）。這個資料集裡面，每個資料都包含4個值(sepal len

利用隨機森林和梯度提升決策樹對titanic資料進行分類，並對結果進行分析

import pandas as pd from sklearn.cross_validation import train_test_split from sklearn.feature_extraction import DictVectorizer from skle

資料探勘——決策樹分類

決策樹分類是資料探勘中分類分析的一種演算法。顧名思義，決策樹是基於“樹”結構來進行決策的，是人類在面臨決策問題時一種很自然的處理機制。例如下圖一個簡單的判別買不買電腦的決策樹： &nbs

統計學習方法五決策樹分類

回歸 element row tps 樣本 pan 類別表示 splay 決策樹分類 1，概念　　　　　　　　 2，決策樹算法 2.1，特征選擇：　　熵：值越大，不確定性因素越大；條件熵：條件對結果的影響不確定性；信息增益；信息增益比　　　　　　　　　　　　　　

機器學習之路: python 決策樹分類預測泰坦尼克號乘客是否幸存

現象 info n) 指標 ssi 直觀 learn 保持 afr 使用python3 學習了決策樹分類器的api 涉及到特征的提取，數據類型保留，分類類型抽取出來新的類型需要網上下載數據集，我把他們下載到了本地，可以到我的git下載代碼和數據集: https

R語言學習(三)——決策樹分類

分類分類（Classification）任務就是通過學習獲得一個目標函式（Target Function）f, 將每個屬性集x對映到一個預先定義好的類標號y。分類任務的輸入資料是記錄的集合，每條記錄也稱為例項或者樣例。用元組(X,y)表示，其中，X 是屬性集合，y是一個特殊的

sklearn的快速使用之六（決策樹分類）

print(__doc__) import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import load_iris from sklearn.tree import DecisionTreeClas

Pyhton實現決策樹演算法 MNIST資料集

Pyhton實現決策樹演算法 MNIST資料集決策樹是一種比較接近人類思維方式的演算法，將樣本通過每個特徵值的資訊增益進行劃分，從而保證每個劃分之後的結果資訊熵的消減量達到最大。具體的原理請大家自己查詢相關資料。 sklearn實現程式碼如下, 準確率可以達到90%左右。 fr

Logistics迴歸分類鳶尾花資料集

import numpy as np from sklearn.linear_model import LogisticRegression import matplotlib.pyplot as plt import matplotlib as mpl import pandas as pd fr

sklearn學習筆記之決策樹分類和線性迴歸

decisoin tree： # -*- coding: utf-8 -*- import sklearn from sklearn import tree import matplotlib.pyplot as plt from sklearn.model_selection impor

Python實現DescionTree決策樹 --- 選擇切分資料集的最佳特徵

wechat:812716131 ------------------------------------------------------ 技術交流群請聯絡上面wechat ----------------------------------------------

決策樹分類器演算法實現

# -*- coding: cp936 -*- #決策樹分類器 my_data=[['slashdot','USA','yes',18,'None'],['google','France','yes',23,'Premium'], ['digg','USA

機器學習演算法（二）——決策樹分類演算法及R語言實現方法

決策樹演算法是分類演算法中最常用的演算法之一。決策樹是一種類似流程圖的樹形結構，可以處理高維資料，直觀易理解，且準確率較高，因此應用廣泛。本篇小博就決策樹的若干演算法：ID3演算法、C4.5演算法以及分類迴歸樹（CART）、C5.0進行對比介紹，並對比C4.5與C5.0處理

使用R完成決策樹分類

關於決策樹理論方面的介紹，李航的《統計機器學習》第五章有很好的講解。傳統的ID3和C4.5一般用於分類問題，其中ID3使用資訊增益進行特徵選擇，即遞迴的選擇分類能力最強的特徵對資料進行分割，C4.5唯一不同的是使用資訊增益比進行特徵選擇。特徵A對訓練資料D的資訊增益g(

決策樹分類——matlab程式

%% 使用ID3決策樹演算法預測銷量高低 clc; clear ; %% 資料預處理 disp('正在進行資料預處理...'); [matrix,attributes_label,attributes] = id3_preprocess(); %% 構造ID3決策樹，其

影像資訊提取之——基於專家知識的決策樹分類

可以將多源資料用於影像分類當中，這就是專家知識的決策樹分類器，本專題以ENVI中Decision Tree為例來敘述這一分類器。本專題包括以下內容：專家知識分類器概述知識（規則）定義 ENVI中Decision Tree的使用概述基於知識的決策樹分

決策樹分類器（ID3、C4.5 Java實現）

分類什麼是分類？舉個例子，銀行貸款員需要分析資料，以便搞清楚哪些是貸款申請者是值得信賴的。通訊公司也希望能分清楚哪些客戶容易接受某一套餐，從而定向營銷。資料分類一般又包括學習階段（構建分類器）和分類階段（使用模型預測給定資料的類標號）。決策樹分類器

java寫的決策樹演算法（資料探勘演算法）

import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.Iterator; //除錯過程中發現4個錯誤，感謝宇宙無敵的除錯工具——print //1、sele

決策樹分類鳶尾花資料demo

相關推薦