機器學習EM實踐

阿新 • • 發佈：2018-12-02

EM.py

# !/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances_argmin


mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False


if __name__ == '__main__':
    style = 'sklearn'

    np.random.seed(0)
    mu1_fact = (0, 0, 0)
    cov_fact = np.identity(3)
    data1 = np.random.multivariate_normal(mu1_fact, cov_fact, 400)
    mu2_fact = (2, 2, 1)
    cov_fact = np.identity(3)
    data2 = np.random.multivariate_normal(mu2_fact, cov_fact, 100)
    data = np.vstack((data1, data2))
    y = np.array([True] * 400 + [False] * 100)

    if style == 'sklearn':
        g = GaussianMixture(n_components=2, covariance_type='full', tol=1e-6, max_iter=1000)
        g.fit(data)
        print ('類別概率:\t', g.weights_[0])
        print ('均值:\n', g.means_, '\n')
        print ('方差:\n', g.covariances_, '\n')
        mu1, mu2 = g.means_
        sigma1, sigma2 = g.covariances_
    else:
        num_iter = 100
        n, d = data.shape
        # 隨機指定
        # mu1 = np.random.standard_normal(d)
        # print mu1
        # mu2 = np.random.standard_normal(d)
        # print mu2
        mu1 = data.min(axis=0)
        mu2 = data.max(axis=0)
        sigma1 = np.identity(d)
        sigma2 = np.identity(d)
        pi = 0.5
        # EM
        for i in range(num_iter):
            # E Step
            norm1 = multivariate_normal(mu1, sigma1)
            norm2 = multivariate_normal(mu2, sigma2)
            tau1 = pi * norm1.pdf(data)
            tau2 = (1 - pi) * norm2.pdf(data)
            gamma = tau1 / (tau1 + tau2)

            # M Step
            mu1 = np.dot(gamma, data) / np.sum(gamma)
            mu2 = np.dot((1 - gamma), data) / np.sum((1 - gamma))
            sigma1 = np.dot(gamma * (data - mu1).T, data - mu1) / np.sum(gamma)
            sigma2 = np.dot((1 - gamma) * (data - mu2).T, data - mu2) / np.sum(1 - gamma)
            pi = np.sum(gamma) / n
            print (i, ":\t", mu1, mu2)
        print ('類別概率:\t', pi)
        print ('均值:\t', mu1, mu2)
        print ('方差:\n', sigma1, '\n\n', sigma2, '\n')

    # 預測分類
    norm1 = multivariate_normal(mu1, sigma1)
    norm2 = multivariate_normal(mu2, sigma2)
    tau1 = norm1.pdf(data)
    tau2 = norm2.pdf(data)

    fig = plt.figure(figsize=(13, 7), facecolor='w')
    ax = fig.add_subplot(121, projection='3d')
    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c='b', s=30, marker='o', depthshade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title(u'原始資料', fontsize=18)
    ax = fig.add_subplot(122, projection='3d')
    order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='euclidean')
    if order[0] == 0:
        c1 = tau1 > tau2
    else:
        c1 = tau1 < tau2
    c2 = ~c1
    acc = np.mean(y == c1)
    print (u'準確率：%.2f%%' % (100*acc))
    ax.scatter(data[c1, 0], data[c1, 1], data[c1, 2], c='r', s=30, marker='o', depthshade=True)
    ax.scatter(data[c2, 0], data[c2, 1], data[c2, 2], c='g', s=30, marker='^', depthshade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title(u'EM演算法分類', fontsize=18)
    # plt.suptitle(u'EM演算法的實現', fontsize=20)
    # plt.subplots_adjust(top=0.92)
    plt.tight_layout()
    plt.show()

GMM.py

# !/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt

mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
# from matplotlib.font_manager import FontProperties
# font_set = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15)
# fontproperties=font_set


def expand(a, b):
    d = (b - a) * 0.05
    return a-d, b+d


if __name__ == '__main__':
    data = np.loadtxt('18.HeightWeight.csv', dtype=np.float, delimiter=',', skiprows=1)
    print (data.shape)
    y, x = np.split(data, [1, ], axis=1)
    x, x_test, y, y_test = train_test_split(x, y, train_size=0.6, random_state=0)
    gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
    x_min = np.min(x, axis=0)
    x_max = np.max(x, axis=0)
    gmm.fit(x)
    print ('均值 = \n', gmm.means_)
    print ('方差 = \n', gmm.covariances_)
    y_hat = gmm.predict(x)
    y_test_hat = gmm.predict(x_test)
    change = (gmm.means_[0][0] > gmm.means_[1][0])
    if change:
        z = y_hat == 0
        y_hat[z] = 1
        y_hat[~z] = 0
        z = y_test_hat == 0
        y_test_hat[z] = 1
        y_test_hat[~z] = 0
    acc = np.mean(y_hat.ravel() == y.ravel())
    acc_test = np.mean(y_test_hat.ravel() == y_test.ravel())
    acc_str = u'訓練集準確率：%.2f%%' % (acc * 100)
    acc_test_str = u'測試集準確率：%.2f%%' % (acc_test * 100)
    print (acc_str)
    print (acc_test_str)

    cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0'])
    cm_dark = mpl.colors.ListedColormap(['r', 'g'])
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)
    grid_hat = gmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    if change:
        z = grid_hat == 0
        grid_hat[z] = 1
        grid_hat[~z] = 0
    plt.figure(figsize=(9, 7), facecolor='w')
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
    plt.scatter(x[:, 0], x[:, 1], s=50, c=np.squeeze(y), marker='o', cmap=cm_dark, edgecolors='k')
    plt.scatter(x_test[:, 0], x_test[:, 1], s=60, c=np.squeeze(y_test), marker='^', cmap=cm_dark, edgecolors='k')

    p = gmm.predict_proba(grid_test)
    p = p[:, 0].reshape(x1.shape)
    CS = plt.contour(x1, x2, p, levels=(0.2, 0.5, 0.8), colors=list('rgb'), linewidths=2)
    plt.clabel(CS, fontsize=15, fmt='%.1f', inline=True)
    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    xx = 0.9*ax1_min + 0.1*ax1_max
    yy = 0.1*ax2_min + 0.9*ax2_max
    plt.text(xx, yy, acc_str, fontsize=18)
    yy = 0.15*ax2_min + 0.85*ax2_max
    plt.text(xx, yy, acc_test_str, fontsize=18)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.xlabel(u'身高(cm)', fontsize='large')
    plt.ylabel(u'體重(kg)', fontsize='large')
    plt.title(u'EM演算法估算GMM的引數', fontsize=20)
    plt.grid()
    plt.show()

GMM_Parameter.py

# !/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt

mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False


def expand(a, b, rate=0.05):
    d = (b - a) * rate
    return a-d, b+d


def accuracy_rate(y1, y2):
    acc = np.mean(y1 == y2)
    return acc if acc > 0.5 else 1-acc


if __name__ == '__main__':
    np.random.seed(0)
    cov1 = np.diag((1, 2))
    N1 = 500
    N2 = 300
    N = N1 + N2
    x1 = np.random.multivariate_normal(mean=(1, 2), cov=cov1, size=N1)
    m = np.array(((1, 1), (1, 3)))
    x1 = x1.dot(m)
    x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
    x = np.vstack((x1, x2))
    y = np.array([0]*N1 + [1]*N2)

    types = ('spherical', 'diag', 'tied', 'full')
    err = np.empty(len(types))
    bic = np.empty(len(types))
    for i, type in enumerate(types):
        gmm = GaussianMixture(n_components=2, covariance_type=type, random_state=0)
        gmm.fit(x)
        err[i] = 1 - accuracy_rate(gmm.predict(x), y)
        bic[i] = gmm.bic(x)
    print ('錯誤率：', err.ravel())
    print ('BIC：', bic.ravel())
    xpos = np.arange(4)
    ax = plt.axes()
    b1 = ax.bar(xpos-0.3, err, width=0.3, color='#77E0A0')
    b2 = ax.twinx().bar(xpos, bic, width=0.3, color='#FF8080')
    plt.grid(True)
    bic_min, bic_max = expand(bic.min(), bic.max())
    plt.ylim((bic_min, bic_max))
    plt.xticks(xpos, types)
    plt.legend([b1[0], b2[0]], (u'錯誤率', u'BIC'))
    plt.title(u'不同方差型別的誤差率和BIC', fontsize=18)
    plt.show()

    optimal = bic.argmin()
    gmm = GaussianMixture(n_components=2, covariance_type=types[optimal], random_state=0)
    gmm.fit(x)
    print ('均值 = \n', gmm.means_)
    print ('方差 = \n', gmm.covariances_)
    y_hat = gmm.predict(x)

    cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0'])
    cm_dark = mpl.colors.ListedColormap(['r', 'g'])
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)
    grid_hat = gmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    if gmm.means_[0][0] > gmm.means_[1][0]:
        z = grid_hat == 0
        grid_hat[z] = 1
        grid_hat[~z] = 0
    plt.figure(figsize=(9, 7), facecolor='w')
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
    plt.scatter(x[:, 0], x[:, 1], s=30, c=np.squeeze(y), marker='o', cmap=cm_dark, edgecolors='k')

    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title(u'GMM調參：covariance_type=%s' % types[optimal], fontsize=20)
    plt.grid()
    plt.show()

GMM_Iris.py

# !/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances_argmin

mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

iris_feature = u'花萼長度', u'花萼寬度', u'花瓣長度', u'花瓣寬度'


def expand(a, b, rate=0.05):
    d = (b - a) * rate
    return a-d, b+d


def iris_type(s):
    it = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
    return it[s]


if __name__ == '__main__':
    path = '..\\8.Regression\\8.iris.data'  # 資料檔案路徑
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    # 將資料的0到3列組成x，第4列得到y
    x_prime, y = np.split(data, (4,), axis=1)
    y = y.ravel()

    n_components = 3
    feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
    plt.figure(figsize=(10, 9), facecolor='#FFFFFF')
    for k, pair in enumerate(feature_pairs):
        x = x_prime[:, pair]
        m = np.array([np.mean(x[y == i], axis=0) for i in range(3)])  # 均值的實際值
        print('實際均值 = \n', m)

        gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
        gmm.fit(x)
        print ('預測均值 = \n', gmm.means_)
        print ('預測方差 = \n', gmm.covariances_)
        y_hat = gmm.predict(x)
        order = pairwise_distances_argmin(m, gmm.means_, axis=1, metric='euclidean')
        # print '順序：\t', order

        n_sample = y.size
        n_types = 3
        change = np.empty((n_types, n_sample), dtype=np.bool)
        for i in range(n_types):
            change[i] = y_hat == order[i]
        for i in range(n_types):
            y_hat[change[i]] = i
        acc = u'準確率：%.2f%%' % (100*np.mean(y_hat == y))
        print (acc)

        cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0', '#A0A0FF'])
        cm_dark = mpl.colors.ListedColormap(['r', 'g', '#6060FF'])
        x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
        x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
        x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
        grid_test = np.stack((x1.flat, x2.flat), axis=1)
        grid_hat = gmm.predict(grid_test)

        change = np.empty((n_types, grid_hat.size), dtype=np.bool)
        for i in range(n_types):
            change[i] = grid_hat == order[i]
        for i in range(n_types):
            grid_hat[change[i]] = i

        grid_hat = grid_hat.reshape(x1.shape)
        plt.subplot(3, 2, k+1)
        plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
        plt.scatter(x[:, 0], x[:, 1], s=30, c=y, marker='o', cmap=cm_dark, edgecolors='k')
        xx = 0.95 * x1_min + 0.05 * x1_max
        yy = 0.1 * x2_min + 0.9 * x2_max
        plt.text(xx, yy, acc, fontsize=14)
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.xlabel(iris_feature[pair[0]], fontsize=14)
        plt.ylabel(iris_feature[pair[1]], fontsize=14)
        plt.grid()
    plt.tight_layout(2)
    plt.suptitle(u'EM演算法無監督分類鳶尾花資料', fontsize=20)
    plt.subplots_adjust(top=0.92)
    plt.show()

DPGMM.py

# !/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
import scipy as sp
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse


def expand(a, b, rate=0.05):
    d = (b - a) * rate
    return a-d, b+d


matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False


if __name__ == '__main__':
    np.random.seed(0)
    cov1 = np.diag((1, 2))
    N1 = 500
    N2 = 300
    N = N1 + N2
    x1 = np.random.multivariate_normal(mean=(3, 2), cov=cov1, size=N1)
    m = np.array(((1, 1), (1, 3)))
    x1 = x1.dot(m)
    x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
    x = np.vstack((x1, x2))
    y = np.array([0]*N1 + [1]*N2)
    n_components = 3

    # 繪圖使用
    colors = '#A0FFA0', '#2090E0', '#FF8080'
    cm = mpl.colors.ListedColormap(colors)
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)

    plt.figure(figsize=(9, 9), facecolor='w')
    plt.suptitle(u'GMM/DPGMM比較', fontsize=23)

    ax = plt.subplot(211)
    gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
    gmm.fit(x)
    centers = gmm.means_
    covs = gmm.covariances_
    print ('GMM均值 = \n', centers)
    print ('GMM方差 = \n', covs)
    y_hat = gmm.predict(x)

    grid_hat = gmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')

    clrs = list('rgbmy')
    for i, cc in enumerate(zip(centers, covs)):
        center, cov = cc
        value, vector = sp.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / sp.linalg.norm(vector[0])
        angle = 180* np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color=clrs[i], alpha=0.5, clip_box = ax.bbox)
        ax.add_artist(e)

    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title(u'GMM', fontsize=20)
    plt.grid(True)

    # DPGMM
    dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5,
                                    weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=10)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm.covariances_
    print ('DPGMM均值 = \n', centers)
    print ('DPGMM方差 = \n', covs)
    y_hat = dpgmm.predict(x)
    # print y_hat

    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')

    for i, cc in enumerate(zip(centers, covs)):
        if i not in y_hat:
            continue
        center, cov = cc
        value, vector = sp.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / sp.linalg.norm(vector[0])
        angle = 180* np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color='m', alpha=0.5, clip_box = ax.bbox)
        ax.add_artist(e)

    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('DPGMM', fontsize=20)
    plt.grid(True)

    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()

GMM_pdf.py

# !/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
from sklearn.mixture import GaussianMixture
import scipy as sp
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import warnings


def expand(a, b, rate=0.05):
    d = (b - a) * rate
    return a-d, b+d


if __name__ == '__main__':
    warnings.filterwarnings(action='ignore', category=RuntimeWarning)
    np.random.seed(0)
    cov1 = np.diag((1, 2))
    N1 = 500
    N2 = 300
    N = N1 + N2
    x1 = np.random.multivariate_normal(mean=(3, 2), cov=cov1, size=N1)
    m = np.array(((1, 1), (1, 3)))
    x1 = x1.dot(m)
    x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
    x = np.vstack((x1, x2))
    y = np.array([0]*N1 + [1]*N2)

    gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
    gmm.fit(x)
    centers = gmm.means_
    covs = gmm.covariances_
    print ('GMM均值 = \n', centers)
    print ('GMM方差 = \n', covs)
    y_hat = gmm.predict(x)

    colors = '#A0FFA0', '#FF8080',
    levels = 10
    cm = mpl.colors.ListedColormap(colors)
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)
    print (gmm.score_samples(grid_test))
    grid_hat = -gmm.score_samples(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.figure(figsize=(9, 7), facecolor='w')
    ax = plt.subplot(111)
    cmesh = plt.pcolormesh(x1, x2, grid_hat, cmap=plt.cm.Spectral)
    plt.colorbar(cmesh, shrink=0.8)
    CS = plt.contour(x1, x2, grid_hat, levels=np.logspace(0, 2, num=levels, base=10), colors='w', linewidths=1)
    plt.clabel(CS, fontsize=9, inline=1, fmt='%.1f')
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')

    for i, cc in enumerate(zip(centers, covs)):
        center, cov = cc
        value, vector = sp.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / sp.linalg.norm(vector[0])
        angle = 180* np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color='m', alpha=0.5, clip_box = ax.bbox)
        ax.add_artist(e)

    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.title(u'GMM似然函式值', fontsize=20)
    plt.grid(True)
    plt.show()

機器學習EM實踐

EM.py # !/usr/bin/python # -*- coding:utf-8 -*- import numpy as np from scipy.stats import multivariate_normal from sklearn.mixture import Gaussian

【機器學習PAI實踐二】人口普查統計

區域 targe 使用方式 team 包含 csdn 作者我們行處理一、背景感謝大家關註玩轉數據系列文章，我們希望通過在阿裏雲機器學習平臺上提供demo數據並搭建相關的實驗流程的方式來幫助大家學習如何通過算法來挖掘數據中的價值。本系列文章包含詳細的實驗流程以及相關

[機器學習python實踐(5)]Sklearn實現集成

ace 訓練存在 edi 每一個 predict utf-8 avg score 1,集成集成（Ensemble）分類模型是綜合考量多個分類器的預測結果，從而做出決策。一般分為兩種方式：1）利用相同的訓練數據同時搭建多個獨立的分類模型，然後通過投票的方式，以少數服從多數

[python機器學習及實踐(6)]Sklearn實現主成分分析（PCA）

相關性 hit 變量 gray tran total 空間 mach show 1.PCA原理主成分分析（Principal Component Analysis，PCA），是一種統計方法。通過正交變換將一組可能存在相關性的變量轉換為一組線性不相關的變量，轉換後的這組

機器學習——Perceptron實踐

!= col cor mage append nbsp .data arr div 一、題目：使用自己寫的感知機Perceptron實現對鳶尾花數據的分類。　　數據來源： from sklearn.datasets import load_iris dataSet =

重回機器學習-《python機器學習及實踐》讀書筆記二

一.三個率機器學習模型訓練好之後，會在樣本外進行測試，然後我們可以得到三個“率”：準確率召回率精確率其實這些也沒有什麼大不了的，大家如果學習

PYTHON機器學習及實踐_從零開始通往KAGGLE競賽之路pdf

【下載地址】本書面向所有對機器學習與資料探勘的實踐及競賽感興趣的讀者，從零開始，以Python程式語言為基礎，在不涉及大量數學模型與複雜程式設計知識的前提下，逐步帶領讀者熟悉並且掌握當下最流行的機器學習、數learn作為基礎機器學習工具；第3章進階篇，涉及怎樣藉助高階技術或者模型進一步提升既有機器學習系統的

python機器學習及實踐學習筆記1-如何開啟ipynb字尾檔案

python機器學習及實踐學習筆記1-如何開啟ipynb字尾檔案 2017年02月22日 14:58:08 hustzhoutian 閱讀數：45365更多個人分類：深度學習需要安裝ipython notebook，如果你已經安裝Anaconda

Python機器學習及實踐——基礎篇11（迴歸樹）

迴歸樹在選擇不同特徵作為分裂節點的策略上，與基礎篇6的決策樹的思路類似。不同之處在於，迴歸樹葉節點的資料型別不是離散型，而是連續型。決策樹每個葉節點依照訓練資料表現的概率傾向決定了其最終的預測類；而回歸樹的葉節點確實一個個具體的值，從預測值連續這個意義上嚴格地講，迴歸樹不能成

機器學習應用實踐難？看完這十大訣竅瞬間懂了！

點選上方 ⬆ 藍字關注七月線上實驗室來源InfoWorld | 作者Alexander

PMML模型檔案在機器學習的實踐經驗

這種方案，在本次參加 QCon 大會時，Paypal的機器學習平臺中也有所提及： PMML 預測模型標記語言(Predictive Model Markup Language,PMML)是一種可以呈現預測分析模型的事實標準語言。標準東西的好處就是，各種開發語言都可以使用

【機器學習PAI實踐十二】機器學習演算法基於信用卡消費記錄做信用評分

背景如果你是做網際網路金融的，那麼一定聽說過評分卡。評分卡是信用風險評估領域常用的建模方法，評分卡並不簡單對應於某一種機器學習演算法，而是一種通用的建模框架，將原始資料通過分箱後進行特徵工程變換，繼而應用於線性模型進行建模的一種方法。評分卡建模理論常

【機器學習PAI實踐七】文字分析演算法實現新聞自動分類

一、背景新聞分類是文字挖掘領域較為常見的場景。目前很多媒體或是內容生產商對於新聞這種文字的分類常常採用人肉打標的方式，消耗了大量的人力資源。本文嘗試通過智慧的文字挖掘演算法對於新聞文字進行分類。無需任何人肉打標，完全由機器智慧化實現。本文通過PLDA演算

Python機器學習演算法實踐——k均值聚類（k-means）

一開始的目的是學習十大挖掘演算法（機器學習演算法）,並用編碼實現一遍，但越往後學習，越往後實現編碼，越發現自己的編碼水平低下，學習能力低。這一個k-means演算法用Python實現竟用了三天時間，可見編碼水平之低，而且在編碼的過程中看了別人的編碼，才發現自己對

Python機器學習及實踐——基礎篇7（分類整合模型）

常言道：“一個籬笆三個樁，一個好漢三個幫”。整合分類模型便是綜合考量多個分類器的預測結果，從而做出決策。只是這種“綜合考量”的方式大體上分為兩種：一種是利用相同的訓練資料同時搭建多個獨立的分類模型，然後通過投票的方式，以少數服從多數的原則作出最終的分類決策。比

《Python機器學習及實踐》----無監督學習之資料聚類

本片部落格是根據《Python機器學習及實踐》一書中的例項，所有程式碼均在本地編譯通過。資料為從該書指定的百度網盤上下載的，或者是sklearn自帶資料下載到本地使用的。程式碼片段： # coding: utf-8 # 分別匯入numpy、matplot

Python機器學習及實踐——基礎篇10（K近鄰迴歸）

在基礎篇5中提到裡這類模型不需要訓練引數的特點。在迴歸任務重，k近鄰（迴歸）模型同樣只是藉助周圍K個最近訓練樣本的目標數值，對待測樣本的迴歸值進行決策。自然，也衍生出衡量待測樣吧迴歸值的不同方式，即到底是對K個近鄰目標數值使用普通的算術平均演算法，還是同時考慮距離的差

Python機器學習演算法實踐——梯度上升演算法

一：理論部分給定一個樣本集，每個樣本點有兩個維度值（X1，X2）和一個類別值，類別只有兩類，我們以0和1代表。資料如下所示：樣本 X1 X2 類別 1

《Python機器學習及實踐》----模型實用技巧

本片部落格是根據《Python機器學習及實踐》一書中的例項，所有程式碼均在本地編譯通過。資料為從該書指定的百度網盤上下載的，或者是sklearn自帶資料下載到本地使用的。程式碼片段： measurements = [{'city': 'Dubai',

python機器學習及實踐第二章的2.1.2.1線性迴歸器程式報錯Reshape your data either using array.reshap(-1,1)的原因及解決方法

最近在看Python機器學習及實踐（從零開始kaggle競賽之路）這本書，到了第二章的線性迴歸器的GradientBoostingRegressor模型照著敲程式碼的時候出現了以下的錯誤出錯的問題在於標準化函式這裡。可見fit_tran

機器學習EM實踐

相關推薦