# K-means clustering implemented in Python
# Posted by 阿新 on 2018-12-31
from numpy import *
import matplotlib.pyplot as plt
# Helper functions
# Load the data set
def loadDataSet(filename):
    """Read a tab-separated text file of numbers into a list of float rows.

    Args:
        filename: Path of the text file; each non-blank line holds one
            sample whose fields are separated by tab characters.

    Returns:
        A list with one ``list[float]`` per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted with ``float``.
    """
    dataMat = []
    # The context manager guarantees the handle is closed even when a
    # malformed field raises part-way through the file.
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (typically a trailing newline at end of file)
                # carry no sample; float('') would raise on them.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
# Return the Euclidean distance between two points
def distEclud(vecA, vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``.

    Args:
        vecA: Array-like coordinates of the first point (list, ndarray or
            a matrix row).
        vecB: Array-like coordinates of the second point; must be
            broadcast-compatible with ``vecA``.

    Returns:
        The square root of the sum of squared coordinate differences, as a
        NumPy floating-point scalar.
    """
    # Imported locally and called through the ``np`` namespace so the result
    # does not depend on ``sum`` being shadowed by a star import.
    import numpy as np

    # asarray lets plain Python lists be subtracted and turns matrix rows
    # into ordinary arrays so squaring is always element-wise.
    diff = np.asarray(vecA) - np.asarray(vecB)
    return np.sqrt(np.sum(np.square(diff)))
# Build a set of k random centroids
def randCent(dataSet, k):
    """Create ``k`` random centroids inside the bounding box of ``dataSet``.

    Args:
        dataSet: 2-D array-like with one sample per row and one feature per
            column (ndarray, matrix or list of lists).
        k: Number of centroids to generate.

    Returns:
        A ``numpy.matrix`` of shape ``(k, n)`` where ``n`` is the number of
        columns of ``dataSet``. Column ``j`` is ``min_j + range_j * r`` with
        ``r`` drawn from ``numpy.random.rand`` and ``min_j`` / ``range_j``
        the minimum and max-minus-min of input column ``j``.
    """
    import numpy as np

    # asarray accepts the list of lists produced by a plain file loader as
    # well as ndarray / matrix inputs.
    data = np.asarray(dataSet, dtype=float)
    n = data.shape[1]

    # Per-column minimum and range, computed in one vectorised pass.
    col_min = data.min(axis=0)
    col_range = data.max(axis=0) - col_min

    # rand(n, k).T consumes the random stream column by column, i.e. in the
    # same order as drawing rand(k, 1) once per feature, so seeded runs
    # produce the same centroids as a per-column loop.
    unit = np.random.rand(n, k).T

    # asmatrix keeps the matrix return type; the ``mat`` alias is not
    # available in NumPy 2.x.
    return np.asmatrix(col_min + col_range * unit)
def kMeans(dataSet, k, dist=distEclud, createCent=randCent, *, verbose=True):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Each pass assigns every sample to its nearest centroid and then moves
    each centroid to the mean of the samples assigned to it. Iteration stops
    after a pass in which no sample changes cluster.

    Args:
        dataSet: 2-D array-like with one sample per row.
        k: Number of clusters.
        dist: Callable ``dist(centroid_row, sample_row)`` returning a
            distance that can be compared with ``<``.
        createCent: Callable ``createCent(data, k)`` returning the initial
            centroids as an object indexable with ``[j, :]``.
        verbose: When true (the default) the centroids are printed once per
            pass, after assignment and before they are recomputed.

    Returns:
        A tuple ``(centroids, label)``: the centroid container returned by
        ``createCent`` updated in place, and a float array of length ``m``
        holding each sample's cluster index (``-1`` if no centroid had a
        comparable distance to that sample).
    """
    import numpy as np

    # asanyarray converts lists to ndarray but leaves ndarray / matrix
    # inputs as they are, so row slicing behaves as callers expect.
    data = np.asanyarray(dataSet)
    m = data.shape[0]

    # -1 means "not assigned yet". Starting from 0 would make samples whose
    # nearest centroid is number 0 look unchanged on the first pass.
    label = np.full(m, -1.0)
    centroids = createCent(data, k)

    clusterChanged = True
    while clusterChanged:
        clusterChanged = False

        # Find the nearest centroid for every sample.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = dist(centroids[j, :], data[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if label[i] != minIndex:
                clusterChanged = True
                label[i] = minIndex

        if verbose:
            print(centroids)

        # Recompute the position of every centroid.
        for cent in range(k):
            ptsInClust = data[np.nonzero(label == cent)[0]]
            # The mean of an empty selection is NaN, which would make this
            # centroid unreachable forever; keep its previous position.
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)

    return centroids, label
if __name__ == '__main__':
    # Demo: cluster the samples in testSet.txt into k groups and draw them.
    k = 4
    filename = 'testSet.txt'
    dataArray = array(loadDataSet(filename))
    centroids, label = kMeans(dataArray, k)

    # One marker/colour pair per cluster; the modulo below reuses the pairs
    # when k is larger than the number of styles instead of raising.
    markers = 'o*s^'
    colors = 'bgrc'
    for cluster in range(k):
        members = dataArray[label == cluster]
        style = cluster % len(markers)
        # linestyle='none' draws the markers only, matching what plotting
        # every sample on its own produces.
        plt.plot(members[:, 0], members[:, 1], linestyle='none',
                 color=colors[style], marker=markers[style])

    # Draw the centroids as black plus signs on top of the samples.
    centroidArray = asarray(centroids)
    plt.plot(centroidArray[:, 0], centroidArray[:, 1], 'k+')

    # Without show() a script run exits without ever displaying the figure.
    plt.show()