機器學習EM實踐
阿新 • • 發佈:2018-12-02
EM.py
# !/usr/bin/python # -*- coding:utf-8 -*- import numpy as np from scipy.stats import multivariate_normal from sklearn.mixture import GaussianMixture from mpl_toolkits.mplot3d import Axes3D import matplotlib as mpl import matplotlib.pyplot as plt from sklearn.metrics.pairwise import pairwise_distances_argmin mpl.rcParams['font.sans-serif'] = [u'SimHei'] mpl.rcParams['axes.unicode_minus'] = False if __name__ == '__main__': style = 'sklearn' np.random.seed(0) mu1_fact = (0, 0, 0) cov_fact = np.identity(3) data1 = np.random.multivariate_normal(mu1_fact, cov_fact, 400) mu2_fact = (2, 2, 1) cov_fact = np.identity(3) data2 = np.random.multivariate_normal(mu2_fact, cov_fact, 100) data = np.vstack((data1, data2)) y = np.array([True] * 400 + [False] * 100) if style == 'sklearn': g = GaussianMixture(n_components=2, covariance_type='full', tol=1e-6, max_iter=1000) g.fit(data) print ('類別概率:\t', g.weights_[0]) print ('均值:\n', g.means_, '\n') print ('方差:\n', g.covariances_, '\n') mu1, mu2 = g.means_ sigma1, sigma2 = g.covariances_ else: num_iter = 100 n, d = data.shape # 隨機指定 # mu1 = np.random.standard_normal(d) # print mu1 # mu2 = np.random.standard_normal(d) # print mu2 mu1 = data.min(axis=0) mu2 = data.max(axis=0) sigma1 = np.identity(d) sigma2 = np.identity(d) pi = 0.5 # EM for i in range(num_iter): # E Step norm1 = multivariate_normal(mu1, sigma1) norm2 = multivariate_normal(mu2, sigma2) tau1 = pi * norm1.pdf(data) tau2 = (1 - pi) * norm2.pdf(data) gamma = tau1 / (tau1 + tau2) # M Step mu1 = np.dot(gamma, data) / np.sum(gamma) mu2 = np.dot((1 - gamma), data) / np.sum((1 - gamma)) sigma1 = np.dot(gamma * (data - mu1).T, data - mu1) / np.sum(gamma) sigma2 = np.dot((1 - gamma) * (data - mu2).T, data - mu2) / np.sum(1 - gamma) pi = np.sum(gamma) / n print (i, ":\t", mu1, mu2) print ('類別概率:\t', pi) print ('均值:\t', mu1, mu2) print ('方差:\n', sigma1, '\n\n', sigma2, '\n') # 預測分類 norm1 = multivariate_normal(mu1, sigma1) norm2 = multivariate_normal(mu2, sigma2) tau1 = norm1.pdf(data) tau2 = norm2.pdf(data) fig = plt.figure(figsize=(13, 7), facecolor='w') ax = fig.add_subplot(121, projection='3d') ax.scatter(data[:, 0], data[:, 1], data[:, 2], c='b', s=30, marker='o', depthshade=True) ax.set_xlabel('X') ax.set_ylabel('Y') ax.set_zlabel('Z') ax.set_title(u'原始資料', fontsize=18) ax = fig.add_subplot(122, projection='3d') order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='euclidean') if order[0] == 0: c1 = tau1 > tau2 else: c1 = tau1 < tau2 c2 = ~c1 acc = np.mean(y == c1) print (u'準確率:%.2f%%' % (100*acc)) ax.scatter(data[c1, 0], data[c1, 1], data[c1, 2], c='r', s=30, marker='o', depthshade=True) ax.scatter(data[c2, 0], data[c2, 1], data[c2, 2], c='g', s=30, marker='^', depthshade=True) ax.set_xlabel('X') ax.set_ylabel('Y') ax.set_zlabel('Z') ax.set_title(u'EM演算法分類', fontsize=18) # plt.suptitle(u'EM演算法的實現', fontsize=20) # plt.subplots_adjust(top=0.92) plt.tight_layout() plt.show()
GMM.py
# !/usr/bin/python # -*- coding:utf-8 -*- import numpy as np from sklearn.mixture import GaussianMixture from sklearn.model_selection import train_test_split import matplotlib as mpl import matplotlib.colors import matplotlib.pyplot as plt mpl.rcParams['font.sans-serif'] = [u'SimHei'] mpl.rcParams['axes.unicode_minus'] = False # from matplotlib.font_manager import FontProperties # font_set = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15) # fontproperties=font_set def expand(a, b): d = (b - a) * 0.05 return a-d, b+d if __name__ == '__main__': data = np.loadtxt('18.HeightWeight.csv', dtype=np.float, delimiter=',', skiprows=1) print (data.shape) y, x = np.split(data, [1, ], axis=1) x, x_test, y, y_test = train_test_split(x, y, train_size=0.6, random_state=0) gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0) x_min = np.min(x, axis=0) x_max = np.max(x, axis=0) gmm.fit(x) print ('均值 = \n', gmm.means_) print ('方差 = \n', gmm.covariances_) y_hat = gmm.predict(x) y_test_hat = gmm.predict(x_test) change = (gmm.means_[0][0] > gmm.means_[1][0]) if change: z = y_hat == 0 y_hat[z] = 1 y_hat[~z] = 0 z = y_test_hat == 0 y_test_hat[z] = 1 y_test_hat[~z] = 0 acc = np.mean(y_hat.ravel() == y.ravel()) acc_test = np.mean(y_test_hat.ravel() == y_test.ravel()) acc_str = u'訓練集準確率:%.2f%%' % (acc * 100) acc_test_str = u'測試集準確率:%.2f%%' % (acc_test * 100) print (acc_str) print (acc_test_str) cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0']) cm_dark = mpl.colors.ListedColormap(['r', 'g']) x1_min, x1_max = x[:, 0].min(), x[:, 0].max() x2_min, x2_max = x[:, 1].min(), x[:, 1].max() x1_min, x1_max = expand(x1_min, x1_max) x2_min, x2_max = expand(x2_min, x2_max) x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j] grid_test = np.stack((x1.flat, x2.flat), axis=1) grid_hat = gmm.predict(grid_test) grid_hat = grid_hat.reshape(x1.shape) if change: z = grid_hat == 0 grid_hat[z] = 1 grid_hat[~z] = 0 plt.figure(figsize=(9, 7), facecolor='w') plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light) plt.scatter(x[:, 0], x[:, 1], s=50, c=np.squeeze(y), marker='o', cmap=cm_dark, edgecolors='k') plt.scatter(x_test[:, 0], x_test[:, 1], s=60, c=np.squeeze(y_test), marker='^', cmap=cm_dark, edgecolors='k') p = gmm.predict_proba(grid_test) p = p[:, 0].reshape(x1.shape) CS = plt.contour(x1, x2, p, levels=(0.2, 0.5, 0.8), colors=list('rgb'), linewidths=2) plt.clabel(CS, fontsize=15, fmt='%.1f', inline=True) ax1_min, ax1_max, ax2_min, ax2_max = plt.axis() xx = 0.9*ax1_min + 0.1*ax1_max yy = 0.1*ax2_min + 0.9*ax2_max plt.text(xx, yy, acc_str, fontsize=18) yy = 0.15*ax2_min + 0.85*ax2_max plt.text(xx, yy, acc_test_str, fontsize=18) plt.xlim((x1_min, x1_max)) plt.ylim((x2_min, x2_max)) plt.xlabel(u'身高(cm)', fontsize='large') plt.ylabel(u'體重(kg)', fontsize='large') plt.title(u'EM演算法估算GMM的引數', fontsize=20) plt.grid() plt.show()
GMM_Parameter.py
# !/usr/bin/python # -*- coding:utf-8 -*- import numpy as np from sklearn.mixture import GaussianMixture import matplotlib as mpl import matplotlib.colors import matplotlib.pyplot as plt mpl.rcParams['font.sans-serif'] = [u'SimHei'] mpl.rcParams['axes.unicode_minus'] = False def expand(a, b, rate=0.05): d = (b - a) * rate return a-d, b+d def accuracy_rate(y1, y2): acc = np.mean(y1 == y2) return acc if acc > 0.5 else 1-acc if __name__ == '__main__': np.random.seed(0) cov1 = np.diag((1, 2)) N1 = 500 N2 = 300 N = N1 + N2 x1 = np.random.multivariate_normal(mean=(1, 2), cov=cov1, size=N1) m = np.array(((1, 1), (1, 3))) x1 = x1.dot(m) x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2) x = np.vstack((x1, x2)) y = np.array([0]*N1 + [1]*N2) types = ('spherical', 'diag', 'tied', 'full') err = np.empty(len(types)) bic = np.empty(len(types)) for i, type in enumerate(types): gmm = GaussianMixture(n_components=2, covariance_type=type, random_state=0) gmm.fit(x) err[i] = 1 - accuracy_rate(gmm.predict(x), y) bic[i] = gmm.bic(x) print ('錯誤率:', err.ravel()) print ('BIC:', bic.ravel()) xpos = np.arange(4) ax = plt.axes() b1 = ax.bar(xpos-0.3, err, width=0.3, color='#77E0A0') b2 = ax.twinx().bar(xpos, bic, width=0.3, color='#FF8080') plt.grid(True) bic_min, bic_max = expand(bic.min(), bic.max()) plt.ylim((bic_min, bic_max)) plt.xticks(xpos, types) plt.legend([b1[0], b2[0]], (u'錯誤率', u'BIC')) plt.title(u'不同方差型別的誤差率和BIC', fontsize=18) plt.show() optimal = bic.argmin() gmm = GaussianMixture(n_components=2, covariance_type=types[optimal], random_state=0) gmm.fit(x) print ('均值 = \n', gmm.means_) print ('方差 = \n', gmm.covariances_) y_hat = gmm.predict(x) cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0']) cm_dark = mpl.colors.ListedColormap(['r', 'g']) x1_min, x1_max = x[:, 0].min(), x[:, 0].max() x2_min, x2_max = x[:, 1].min(), x[:, 1].max() x1_min, x1_max = expand(x1_min, x1_max) x2_min, x2_max = expand(x2_min, x2_max) x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j] grid_test = np.stack((x1.flat, x2.flat), axis=1) grid_hat = gmm.predict(grid_test) grid_hat = grid_hat.reshape(x1.shape) if gmm.means_[0][0] > gmm.means_[1][0]: z = grid_hat == 0 grid_hat[z] = 1 grid_hat[~z] = 0 plt.figure(figsize=(9, 7), facecolor='w') plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light) plt.scatter(x[:, 0], x[:, 1], s=30, c=np.squeeze(y), marker='o', cmap=cm_dark, edgecolors='k') ax1_min, ax1_max, ax2_min, ax2_max = plt.axis() plt.xlim((x1_min, x1_max)) plt.ylim((x2_min, x2_max)) plt.title(u'GMM調參:covariance_type=%s' % types[optimal], fontsize=20) plt.grid() plt.show()
GMM_Iris.py
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances_argmin
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
iris_feature = u'花萼長度', u'花萼寬度', u'花瓣長度', u'花瓣寬度'
def expand(a, b, rate=0.05):
d = (b - a) * rate
return a-d, b+d
def iris_type(s):
it = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
return it[s]
if __name__ == '__main__':
path = '..\\8.Regression\\8.iris.data' # 資料檔案路徑
data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
# 將資料的0到3列組成x,第4列得到y
x_prime, y = np.split(data, (4,), axis=1)
y = y.ravel()
n_components = 3
feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
plt.figure(figsize=(10, 9), facecolor='#FFFFFF')
for k, pair in enumerate(feature_pairs):
x = x_prime[:, pair]
m = np.array([np.mean(x[y == i], axis=0) for i in range(3)]) # 均值的實際值
print('實際均值 = \n', m)
gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
gmm.fit(x)
print ('預測均值 = \n', gmm.means_)
print ('預測方差 = \n', gmm.covariances_)
y_hat = gmm.predict(x)
order = pairwise_distances_argmin(m, gmm.means_, axis=1, metric='euclidean')
# print '順序:\t', order
n_sample = y.size
n_types = 3
change = np.empty((n_types, n_sample), dtype=np.bool)
for i in range(n_types):
change[i] = y_hat == order[i]
for i in range(n_types):
y_hat[change[i]] = i
acc = u'準確率:%.2f%%' % (100*np.mean(y_hat == y))
print (acc)
cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['r', 'g', '#6060FF'])
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
grid_test = np.stack((x1.flat, x2.flat), axis=1)
grid_hat = gmm.predict(grid_test)
change = np.empty((n_types, grid_hat.size), dtype=np.bool)
for i in range(n_types):
change[i] = grid_hat == order[i]
for i in range(n_types):
grid_hat[change[i]] = i
grid_hat = grid_hat.reshape(x1.shape)
plt.subplot(3, 2, k+1)
plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
plt.scatter(x[:, 0], x[:, 1], s=30, c=y, marker='o', cmap=cm_dark, edgecolors='k')
xx = 0.95 * x1_min + 0.05 * x1_max
yy = 0.1 * x2_min + 0.9 * x2_max
plt.text(xx, yy, acc, fontsize=14)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.xlabel(iris_feature[pair[0]], fontsize=14)
plt.ylabel(iris_feature[pair[1]], fontsize=14)
plt.grid()
plt.tight_layout(2)
plt.suptitle(u'EM演算法無監督分類鳶尾花資料', fontsize=20)
plt.subplots_adjust(top=0.92)
plt.show()
DPGMM.py
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
import scipy as sp
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
def expand(a, b, rate=0.05):
d = (b - a) * rate
return a-d, b+d
matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
if __name__ == '__main__':
np.random.seed(0)
cov1 = np.diag((1, 2))
N1 = 500
N2 = 300
N = N1 + N2
x1 = np.random.multivariate_normal(mean=(3, 2), cov=cov1, size=N1)
m = np.array(((1, 1), (1, 3)))
x1 = x1.dot(m)
x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
x = np.vstack((x1, x2))
y = np.array([0]*N1 + [1]*N2)
n_components = 3
# 繪圖使用
colors = '#A0FFA0', '#2090E0', '#FF8080'
cm = mpl.colors.ListedColormap(colors)
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
grid_test = np.stack((x1.flat, x2.flat), axis=1)
plt.figure(figsize=(9, 9), facecolor='w')
plt.suptitle(u'GMM/DPGMM比較', fontsize=23)
ax = plt.subplot(211)
gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
gmm.fit(x)
centers = gmm.means_
covs = gmm.covariances_
print ('GMM均值 = \n', centers)
print ('GMM方差 = \n', covs)
y_hat = gmm.predict(x)
grid_hat = gmm.predict(grid_test)
grid_hat = grid_hat.reshape(x1.shape)
plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')
clrs = list('rgbmy')
for i, cc in enumerate(zip(centers, covs)):
center, cov = cc
value, vector = sp.linalg.eigh(cov)
width, height = value[0], value[1]
v = vector[0] / sp.linalg.norm(vector[0])
angle = 180* np.arctan(v[1] / v[0]) / np.pi
e = Ellipse(xy=center, width=width, height=height,
angle=angle, color=clrs[i], alpha=0.5, clip_box = ax.bbox)
ax.add_artist(e)
ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.title(u'GMM', fontsize=20)
plt.grid(True)
# DPGMM
dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5,
weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=10)
dpgmm.fit(x)
centers = dpgmm.means_
covs = dpgmm.covariances_
print ('DPGMM均值 = \n', centers)
print ('DPGMM方差 = \n', covs)
y_hat = dpgmm.predict(x)
# print y_hat
ax = plt.subplot(212)
grid_hat = dpgmm.predict(grid_test)
grid_hat = grid_hat.reshape(x1.shape)
plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')
for i, cc in enumerate(zip(centers, covs)):
if i not in y_hat:
continue
center, cov = cc
value, vector = sp.linalg.eigh(cov)
width, height = value[0], value[1]
v = vector[0] / sp.linalg.norm(vector[0])
angle = 180* np.arctan(v[1] / v[0]) / np.pi
e = Ellipse(xy=center, width=width, height=height,
angle=angle, color='m', alpha=0.5, clip_box = ax.bbox)
ax.add_artist(e)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.title('DPGMM', fontsize=20)
plt.grid(True)
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()
GMM_pdf.py
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.mixture import GaussianMixture
import scipy as sp
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import warnings
def expand(a, b, rate=0.05):
d = (b - a) * rate
return a-d, b+d
if __name__ == '__main__':
warnings.filterwarnings(action='ignore', category=RuntimeWarning)
np.random.seed(0)
cov1 = np.diag((1, 2))
N1 = 500
N2 = 300
N = N1 + N2
x1 = np.random.multivariate_normal(mean=(3, 2), cov=cov1, size=N1)
m = np.array(((1, 1), (1, 3)))
x1 = x1.dot(m)
x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
x = np.vstack((x1, x2))
y = np.array([0]*N1 + [1]*N2)
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
gmm.fit(x)
centers = gmm.means_
covs = gmm.covariances_
print ('GMM均值 = \n', centers)
print ('GMM方差 = \n', covs)
y_hat = gmm.predict(x)
colors = '#A0FFA0', '#FF8080',
levels = 10
cm = mpl.colors.ListedColormap(colors)
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
grid_test = np.stack((x1.flat, x2.flat), axis=1)
print (gmm.score_samples(grid_test))
grid_hat = -gmm.score_samples(grid_test)
grid_hat = grid_hat.reshape(x1.shape)
plt.figure(figsize=(9, 7), facecolor='w')
ax = plt.subplot(111)
cmesh = plt.pcolormesh(x1, x2, grid_hat, cmap=plt.cm.Spectral)
plt.colorbar(cmesh, shrink=0.8)
CS = plt.contour(x1, x2, grid_hat, levels=np.logspace(0, 2, num=levels, base=10), colors='w', linewidths=1)
plt.clabel(CS, fontsize=9, inline=1, fmt='%.1f')
plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')
for i, cc in enumerate(zip(centers, covs)):
center, cov = cc
value, vector = sp.linalg.eigh(cov)
width, height = value[0], value[1]
v = vector[0] / sp.linalg.norm(vector[0])
angle = 180* np.arctan(v[1] / v[0]) / np.pi
e = Ellipse(xy=center, width=width, height=height,
angle=angle, color='m', alpha=0.5, clip_box = ax.bbox)
ax.add_artist(e)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
plt.title(u'GMM似然函式值', fontsize=20)
plt.grid(True)
plt.show()