python中使用整合模型，隨機森林分類器，梯度提升決策樹效能模型分析視覺化

阿新 • • 發佈：2019-01-22

# Train a single decision tree, a random forest and a gradient-boosting
# classifier on the Titanic passenger data and print their test-set scores.
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import train_test_split

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
# Local fallback when the remote host cannot be reached:
#titanic = pd.read_csv('../Datasets/Breast-Cancer/titanic.txt')

# Work on an explicit copy so filling 'age' below modifies X itself instead of
# a temporary view of `titanic` (avoids pandas' chained-assignment warning).
X = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']
X.info()
# Replace missing ages with the mean age.
X['age'] = X['age'].fillna(X['age'].mean())
X.info()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

# One dict per row; DictVectorizer one-hot encodes the string-valued columns
# and passes numeric ones through.
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
print(vec.feature_names_)
X_test = vec.transform(X_test.to_dict(orient='records'))

# NOTE(review): no random_state is set on the estimators, so scores may differ
# slightly between runs.
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtc_y_pred = dtc.predict(X_test)

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)

# NOTE(review): scikit-learn documents classification_report(y_true, y_pred).
# The calls below pass the predictions first, so the precision and recall
# columns are swapped relative to that convention and "support" counts
# predicted labels. The order is left as-is so the printed numbers stay
# comparable with the results shown with this script; consider swapping.
print('The accuracy of decision tree is', dtc.score(X_test, y_test))

print(classification_report(dtc_y_pred, y_test))

print('The accuracy of random decision tree is', rfc.score(X_test, y_test))

print(classification_report(rfc_y_pred, y_test))

print('The accuracy of gradient forest tree is', gbc.score(X_test, y_test))

print(classification_report(gbc_y_pred, y_test))



from matplotlib import pyplot as plt
import numpy as np 

def show_values(pc, fmt="%.2f", **kw):
    '''
    Write each heatmap cell's value as text centred on that cell.

    Adapted from https://stackoverflow.com/a/25074150/395857 (by HYRY).

    Parameters
    ----------
    pc : object
        The collection to annotate (here, the return value of ``ax.pcolor``).
        It must provide ``update_scalarmappable()``, ``axes``, ``get_paths()``,
        ``get_facecolors()`` and ``get_array()``.
    fmt : str
        printf-style format applied to every cell value.
    **kw
        Extra keyword arguments forwarded to ``ax.text``.

    Returns
    -------
    None
    '''
    # Called before the face colours are read; presumably this refreshes them
    # from the current data and colormap.
    pc.update_scalarmappable()
    ax = pc.axes
    # Flatten the values so a 2-D array is walked cell by cell rather than row
    # by row (some matplotlib versions appear to return 2-D data here -- TODO
    # confirm against the installed version).
    values = np.ravel(pc.get_array())
    for path, facecolor, value in zip(pc.get_paths(), pc.get_facecolors(), values):
        # Text anchor: mean of all vertices except the last two.
        x, y = path.vertices[:-2, :].mean(0)
        # Black text on light cells, white text on dark cells.
        if np.all(facecolor[:3] > 0.5):
            text_color = (0.0, 0.0, 0.0)
        else:
            text_color = (1.0, 1.0, 1.0)
        ax.text(x, y, fmt % value, ha="center", va="center", color=text_color, **kw)


def cm2inch(*tupl):
    '''
    Convert lengths from centimetres to inches (e.g. for matplotlib figure sizes).

    Adapted from https://stackoverflow.com/a/22787457/395857 (by gns-ank).

    Parameters
    ----------
    *tupl
        Either separate numbers, ``cm2inch(40, 20)``, or one tuple/list of
        numbers, ``cm2inch((40, 20))``. When the first argument is a
        tuple/list, only that sequence is converted and any further
        arguments are ignored.

    Returns
    -------
    tuple
        The converted lengths in inches; an empty tuple when called with no
        arguments.
    '''
    cm_per_inch = 2.54
    if not tupl:
        return ()
    first = tupl[0]
    lengths = first if isinstance(first, (tuple, list)) else tupl
    return tuple(length / cm_per_inch for length in lengths)


def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels, figure_width=40, figure_height=20, correct_orientation=False, cmap='RdBu'):
    '''
    Draw a 2-D array as an annotated heatmap on a new matplotlib figure.

    Inspired by:
    - https://stackoverflow.com/a/16124677/395857
    - https://stackoverflow.com/a/25074150/395857

    Parameters
    ----------
    AUC : 2-D array with a ``shape`` attribute
        Cell values; rows are drawn along the y axis, columns along the x axis.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    xticklabels, yticklabels : sequence
        One label per column / per row, placed at the cell centres.
    figure_width, figure_height : number
        Figure size in centimetres.
    correct_orientation : bool
        When true, flip the y axis and move the x ticks to the top so the
        origin is at the top left instead of the bottom left.
    cmap : str
        Colormap passed to ``pcolor``.

    Returns
    -------
    None
        The figure is left open as the current figure so the caller can save
        or show it.
    '''
    fig, ax = plt.subplots()
    c = ax.pcolor(AUC, edgecolors='k', linestyle='dashed', linewidths=0.2, cmap=cmap)

    # Put the major ticks at the middle of each cell.
    ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)

    ax.set_xticklabels(xticklabels, minor=False)
    ax.set_yticklabels(yticklabels, minor=False)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # Remove the last blank column.
    ax.set_xlim(0, AUC.shape[1])

    # Hide the tick marks but keep their labels. A zero tick length is used
    # instead of assigning Tick.tick1On / tick2On, which current matplotlib
    # no longer honours.
    ax.tick_params(axis='both', which='major', length=0)

    fig.colorbar(c, ax=ax)

    # Print the numeric value inside every cell.
    show_values(c)

    # Optional orientation with the origin at the top left.
    if correct_orientation:
        ax.invert_yaxis()
        ax.xaxis.tick_top()

    fig.set_size_inches(cm2inch(figure_width, figure_height))



def plot_classification_report(classification_report, title='Classification report ', cmap='RdBu'):
    '''
    Plot the per-class rows of a scikit-learn classification report as a heatmap.

    Extension based on https://stackoverflow.com/a/31689645/395857

    Parameters
    ----------
    classification_report : str
        The text of the report (not the scikit-learn function of the same
        name). Per-class rows are expected as
        ``<class name> <precision> <recall> <f1-score> <support>``.
    title : str
        Title of the heatmap.
    cmap : str
        Colormap name forwarded to ``heatmap``.

    Returns
    -------
    None
        The heatmap is drawn by ``heatmap``; nothing is returned.

    Raises
    ------
    ValueError
        If no per-class row can be found in the text.
    '''
    plotMat = []
    support = []
    class_names = []
    for line in classification_report.split('\n'):
        tokens = line.split()
        if not tokens:
            # The per-class rows form one contiguous group; the first blank
            # line after them ends it, so summary rows such as
            # "avg / total", "accuracy" or "macro avg" are never parsed.
            if class_names:
                break
            continue
        if len(tokens) < 5:
            # Too short for name + 3 metrics + support (e.g. the header row).
            continue
        try:
            v = [float(tok) for tok in tokens[-4:-1]]
            sup = int(tokens[-1])
        except ValueError:
            continue
        # Everything before the four numeric columns is the class name, which
        # may contain spaces.
        class_names.append(' '.join(tokens[:-4]))
        support.append(sup)
        print(v)
        plotMat.append(v)

    if not plotMat:
        raise ValueError('no per-class rows found in classification report')

    print('plotMat: {0}'.format(plotMat))
    print('support: {0}'.format(support))

    xlabel = 'Metrics'
    ylabel = 'Classes'
    xticklabels = ['Precision', 'Recall', 'F1-score']
    yticklabels = ['{0} ({1})'.format(name, sup) for name, sup in zip(class_names, support)]
    figure_width = 25
    # Grow the figure height with the number of classes.
    figure_height = len(class_names) + 7
    correct_orientation = False
    heatmap(np.array(plotMat), title, xlabel, ylabel, xticklabels, yticklabels, figure_width, figure_height, correct_orientation, cmap=cmap)

# Pass each model's report text to the plotting helper.
def main():
    '''
    Save one classification-report heatmap per trained model as a PNG file.

    Reads the module-level predictions (dtc_y_pred, rfc_y_pred, gbc_y_pred)
    and y_test, and writes three images into the current working directory.
    '''
    # NOTE(review): predictions are passed as the first argument of
    # classification_report, mirroring the printed reports earlier in the
    # script; scikit-learn documents the order as (y_true, y_pred).
    jobs = (
        (dtc_y_pred, 'decision_tree_report.png'),
        (rfc_y_pred, 'radom_forest_classifier_report.png'),
        (gbc_y_pred, 'gradient_tree_classifier_report.png'),
    )
    for predictions, png_name in jobs:
        report_text = classification_report(predictions, y_test)
        plot_classification_report(report_text)
        plt.savefig(png_name, dpi=200, format='png', bbox_inches='tight')
        # Close the figure so the next report starts from a clean canvas.
        plt.close()


if __name__ == "__main__":
    main()
    #cProfile.run('main()') # if you want to do some profiling

若執行時無法連線到遠端資料來源，會出現如下錯誤：
 File "D:\Python35\lib\urllib\request.py", line 1256, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error [WinError 10060] 由於連線方在一段時間後沒有正確答覆或連線的主機沒有反應，連線嘗試失敗。>

解決方法：將資料來源改為讀取本機的資料檔案：

titanic = pd.read_csv('../Datasets/Breast-Cancer/titanic.txt')

最後輸出結果如下：

The accuracy of decision tree is 0.7811550151975684
             precision    recall  f1-score   support

          0       0.91      0.78      0.84       236
          1       0.58      0.80      0.67        93

avg / total       0.81      0.78      0.79       329

The accuracy of random decision tree is 0.7781155015197568
             precision    recall  f1-score   support

          0       0.90      0.78      0.83       233
          1       0.59      0.78      0.67        96

avg / total       0.81      0.78      0.79       329

The accuracy of gradient forest tree is 0.790273556231003
             precision    recall  f1-score   support

          0       0.92      0.78      0.84       239
          1       0.58      0.82      0.68        90

avg / total       0.83      0.79      0.80       329

視覺化分析如下圖所示：

python中使用整合模型，隨機森林分類器，梯度提升決策樹效能模型分析視覺化

import pandas as pd titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt') #titanic = pd.read_csv('.

scikit-learn /sklearn ：整合學習之隨機森林分類器（Forests of Randomized Tree）官方檔案翻譯

整合學習之隨機森林分類器整合學習的定義和分類。隨機森林法的定義和分類。隨機森林sklearn.ensemble.RandomForestClassifier()引數分類和含義。附註：Bias和Variance的含義和關係。一、整合學習（Ensemble

整合模型python實現，隨機森林，梯度提升決策樹

import pandas as pd; titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt') X = titanic[['pclass'

python資料探勘入門與實踐--------電離層（Ionosphere）, scikit-learn估計器，K近鄰分類器，交叉檢驗，設定引數

ionosphere.data下載地址：http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ 原始碼及相關資料下載 https://github.com/xxg1413/MachineLea

梯度提升決策樹(Gradient Boosting Decision Tree)，用於分類或迴歸。

今天學習了梯度提升決策樹（Gradient Boosting Decision Tree, GBDT），準備寫點東西作為記錄。後續，我會用python 實現GBDT，釋出到我的Github上，敬請Star。梯度提升演算法是一種通用的學習演算法，除了決策樹，還可以使用其它模型作為基學習器。梯度提升演算法的

機器學習總結(四)——隨機森林與GBDT(梯度提升決策樹)

1. 隨機森林簡介隨機森林採用的是bagging的思想，bagging即：通過在訓練樣本集中進行有放回的取樣得到多個取樣集，基於每個取樣集訓練出一個基學習器，再將基學習器結合起來共同實現分類或者回歸。隨機森林在對決策樹進行bagging的基礎上，

【ML筆記】梯度提升決策樹（GBDT）和隨機森林（RF）的異同

GBDT和RF簡介 GBDT（Gradient Boosting Decision Tree） DT + Boosting = GBDT GBDT是一種boosting演算法。boosting工作機制：先從初始訓練集訓練處一個基學習器，然後在根據基學習器的表現對訓練樣本分佈

第4章決策樹演算法【分類】（五決策樹sklearn總結和視覺化總結）

4.7 決策樹sklearn總結參考文件：中文連結英文連結 API: 中文連結英文連結 scikit-learn決策樹演算法類庫內部實現是使用了調優過的CART樹演算法，既可以做分類，又可以做迴歸。分類決策樹的類對應的是DecisionTreeClass

Python實現決策樹並且使用Graphvize視覺化

一、什麼是決策樹（decision tree）——機器學習中的一個重要的分類演算法決策樹是一個類似於資料流程圖的樹結構：其中，每個內部節點表示一個屬性上的測試，每個分支代表一個屬性輸出，而每個樹葉結點代表類或者類的分佈，樹的最頂層是根結點根據天氣情況決定出遊與否的案例二、決策

【火爐煉AI】機器學習051-視覺詞袋模型+極端隨機森林建立圖像分類器

函數自然語言處理 3.6 權重 www. 語言 tar 一行序列【火爐煉AI】機器學習051-視覺詞袋模型+極端隨機森林建立圖像分類器 (本文所使用的Python庫和版本號: Python 3.6, Numpy 1.14, scikit-learn 0.19, mat

整合演算法（Bagging，隨機森林）

引言（關於整合學習）整合演算法包括很多種包括Bagging，隨機森林，Boosting 以及其他更加高效的整合演算法。在這篇部落格上只介紹Bagging演算法及隨機森林，Boosting提升演算法及其他高效的演算法在下一篇詳細講解。整合演算法就是通過構建多個學習器來完成學習任務，是由

機器學習：整合學習（ensemble)，bootstrap，Bagging，隨機森林，Boosting

文章目錄整合學習的樸素思想 Bootstrap理論 Bagging 隨機森林 Boosting 整合學習的樸素思想整合學習基於這樣的思想：對於比較複雜的任務，綜合許多人的意見來進行決策會比“一家獨大”要更好。換句話說、就

機器學習筆記-整合學習之Bagging，Boosting，隨機森林三者特性對比

整合學習的概念定義：整合學習通過構建並結合多個學習器來完成學習任務。分類：只包含同種型別的個體學習器，這樣的整合是“同質”的，例如都是神經網路或者決策樹；包含不同型別的個體學習器，這樣的整合是“異質”的，例如同時包括神經網路和決策樹。作用：整合學習通過將多個學

30分鐘學會用scikit-learn的基本回歸方法（線性、決策樹、SVM、KNN）和整合方法（隨機森林，Adaboost和GBRT）

注：本教程是本人嘗試使用scikit-learn的一些經驗，scikit-learn真的超級容易上手，簡單實用。30分鐘學會用呼叫基本的迴歸方法和整合方法應該是夠了。本文主要參考了scikit-learn的官方網站前言：本教程主要使用了numpy的最最基

利用隨機森林和梯度替身決策樹對titanic資料進行分類，並對結果進行分析

import pandas as pd from sklearn.cross_validation import train_test_split from sklearn.feature_extraction import DictVectorizer from skle

貝葉斯分類器，隨機森林，梯度下載森林，神經網絡相關參數的意義和data leakage

就是抽取子集 width height 特征 rap 貝葉斯分類器技術分享構建的每一顆樹的數據都是有放回的隨機抽取的（也叫bootstrap),n_estimators參數是你想設置多少顆樹，還有就是在進行樹的結點

大白話5分鐘帶你走進人工智慧-第二十九節整合學習之隨機森林隨機方式，out of bag data及程式碼(2)

大白話5分鐘帶你走進人工智慧-第二十九節整合學習之隨機森林隨機方式，out of bag data及程式碼(2) 上一節中我們講解了隨機森林的基本概念，本節的話

Python中的Numpy(4.矩陣操作(算數運算，矩陣積，廣播機制))

1.基本的矩陣操作： '''1.算數運算子：加減乘除''' n1 = np.random.randint(0, 10, size=(4, 5)) print(n1) n2 = n1 + 10 # 對n1進行加法（減法，乘法，除法是一樣的用法） print(n2)

四分類：基本概念，決策樹與模型評估2

4.4模型的過分擬合分類模型的誤差分類：訓練誤差和泛化誤差過擬合：訓練誤差小，泛化能力弱造成過擬合的主要原因：模型複雜度 4.4.1噪聲導致的過分擬合由於擬合了誤分類（噪聲）的訓練記錄，導致了泛化誤差增大。 4.4.2缺乏代表性樣本導致的過分擬合由於訓練樣本太

四分類：基本概念，決策樹與模型評估1

4.1預備知識元組（x，y）：x指屬性集合，y指分類屬性目標函式又稱為分類模型：描述性建模；預測性建模 4.2 解決分類問題的一般方法分類技術是一種根據輸入資料集建立分類模型的系統方法。學習演算法確定分類模型；泛化能力模型訓練集；檢驗集分類模型效能評估： 1.正確

python中使用整合模型，隨機森林分類器，梯度提升決策樹效能模型分析 視覺化

相關推薦

python中使用整合模型，隨機森林分類器，梯度提升決策樹效能模型分析視覺化