模式識別設計(Python程式設計):IRIS資料集的Kmeans聚類與分解聚類法
阿新 • • 發佈:2019-01-04
題目:本次作業的實驗需求是使用分解聚類法與 c-means(即 K-means)聚類法對 IRIS 資料集進行聚類。K-means 聚類程式碼摘錄自網上,分解聚類法為純原創。PS:因為時間緊,分解聚類法進行第二次分解時偷懶了——程式固定只對第一次分解後被移出的那一類(標號 1)再做一次分解,並沒有比較應該分解哪一類——有緣人改改吧~~
資料格式:
kmeans程式碼:
"""K-means (c-means) clustering of the IRIS data set with fixed seed samples."""

import math
from collections import defaultdict

import numpy as np  # not used below; kept because the original script imports it

dataname = "data.txt"

# Layout of the IRIS data set this script is written for: four features per
# sample and three classes stored as consecutive runs of 50 samples each.
N_FEATURES = 4
N_CLUSTERS = 3
CLASS_SIZE = 50


def loadIRISdata(filename):
    """Read whitespace-separated numeric rows from a text file.

    Blank or whitespace-only lines are skipped, and any run of spaces or
    tabs separates two values, so trailing spaces no longer break parsing.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A list of rows, each row being a list of floats.

    Raises:
        ValueError: If a field cannot be converted to float.
    """
    data = []
    with open(filename, mode="r", encoding="utf-8") as rf:
        for line in rf:
            fields = line.split()
            if not fields:
                continue
            data.append([float(field) for field in fields])
    return data


def generateCenters(data):
    """Pick the initial cluster centres.

    The first sample of each class-sized run (indices 0, 50 and 100) is
    used as a seed, so ``data`` must hold at least 101 samples.

    Returns:
        A list of ``N_CLUSTERS`` samples taken from ``data`` (not copies).
    """
    return [data[index * CLASS_SIZE] for index in range(N_CLUSTERS)]


def distance(a, b):
    """Return the Euclidean distance over the first ``N_FEATURES`` components."""
    squared = 0
    for i in range(N_FEATURES):
        delta = a[i] - b[i]
        squared += delta * delta
    return math.sqrt(squared)


def point_avg(points):
    """Return the per-feature mean of ``points``, rounded to 8 decimals.

    Raises:
        ZeroDivisionError: If ``points`` is empty.
    """
    count = float(len(points))
    new_center = []
    for i in range(N_FEATURES):
        total = 0
        for point in points:
            total += point[i]
        new_center.append(float("%.8f" % (total / count)))
    return new_center


def updataCenters(data, assigments, previous_centers=None):
    """Recompute every cluster centre as the mean of its assigned samples.

    Args:
        data: The samples.
        assigments: Cluster index of each sample, parallel to ``data``.
        previous_centers: Optional centres from the previous iteration. A
            cluster that received no sample keeps its previous centre
            instead of triggering a division by zero.

    Returns:
        A list of ``N_CLUSTERS`` centres.

    Raises:
        ZeroDivisionError: If a cluster is empty and no previous centres
            were supplied.
    """
    members_by_cluster = defaultdict(list)
    for label, point in zip(assigments, data):
        members_by_cluster[label].append(point)

    centers = []
    for label in range(N_CLUSTERS):
        members = members_by_cluster[label]
        if not members and previous_centers is not None:
            centers.append(previous_centers[label])
        else:
            centers.append(point_avg(members))
    return centers


def assignment(data, centers):
    """Assign each sample to the index of its nearest centre.

    Ties go to the lowest centre index because the comparison is strict.

    Returns:
        A list of cluster indices, parallel to ``data``.
    """
    assignments = []
    for point in data:
        shortest = float('inf')
        shortest_index = 0
        for index in range(N_CLUSTERS):
            value = distance(point, centers[index])
            if value < shortest:
                shortest = value
                shortest_index = index
        assignments.append(shortest_index)
    return assignments


def kmeans(data):
    """Run K-means until the assignments stop changing.

    Returns:
        A list of ``(cluster_index, sample)`` tuples in input order.
    """
    centers = generateCenters(data)
    assigments = assignment(data, centers)
    old_assigments = None
    while assigments != old_assigments:
        # Passing the current centres lets an emptied cluster keep its centre.
        centers = updataCenters(data, assigments, centers)
        old_assigments = assigments
        assigments = assignment(data, centers)
    return list(zip(assigments, data))


def acc(result):
    """Count samples whose cluster index equals their positional class.

    Sample ``i`` is expected to carry label ``i // CLASS_SIZE``; only the
    first ``N_CLUSTERS * CLASS_SIZE`` entries are inspected.

    Returns:
        A ``(correct, inspected)`` tuple; the same numbers are printed.
    """
    correct = 0
    inspected = 0
    for index, entry in enumerate(result[:N_CLUSTERS * CLASS_SIZE]):
        if entry[0] == index // CLASS_SIZE:
            correct += 1
        inspected += 1
    print('sum:', correct, 'all:', inspected)
    return correct, inspected


def main():
    """Cluster the data file, list each cluster's members, print accuracy."""
    data = loadIRISdata(dataname)
    result = kmeans(data)
    for cluster in range(N_CLUSTERS):
        printed = 0
        print('\n')
        print("第%d類資料有:" % (cluster + 1))
        for index, entry in enumerate(result):
            if entry[0] == cluster:
                print(index, end=' ')
                printed += 1
                # Break the listing into chunks of 21 indices.
                if printed > 20:
                    print('\n')
                    printed = 0
    print('\n')
    correct, inspected = acc(result)
    # '%.2f' (two decimals) replaces the original '%2f' typo.
    print('c-means準確度為:%.2f%%' % ((correct / inspected) * 100))


if __name__ == "__main__":
    main()
kmeans結果:
分解聚類程式碼:
"""Divisive (decomposition) clustering of the IRIS data set into three groups."""

import math  # not used below; kept because the original script imports it
from collections import defaultdict  # not used below; kept for the same reason

import numpy as np

dataname = "data.txt"

# Layout of the IRIS data set this script is written for: three classes
# stored as consecutive runs of 50 samples each.
N_CLUSTERS = 3
CLASS_SIZE = 50


def loadIRISdata(filename):
    """Read whitespace-separated numeric rows from a text file.

    Blank or whitespace-only lines are skipped, and any run of spaces or
    tabs separates two values, so trailing spaces no longer break parsing.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A list of rows, each row being a list of floats.

    Raises:
        ValueError: If a field cannot be converted to float.
    """
    data = []
    with open(filename, mode="r", encoding="utf-8") as rf:
        for line in rf:
            fields = line.split()
            if not fields:
                continue
            data.append([float(field) for field in fields])
    return data


def E(N, N1, N2, a, b):
    """Return the split criterion ``(N1 * N2 / N) * ||a - b||^2``.

    Args:
        N: Normalising sample count.
        N1: Size of the first group.
        N2: Size of the second group.
        a: Mean vector of the first group.
        b: Mean vector of the second group.

    Returns:
        The criterion as a Python float.
    """
    diff = np.ravel(np.asarray(a, dtype=float) - np.asarray(b, dtype=float))
    # Scale first, then take the inner product, as the original
    # ``scalar * row_matrix * column_matrix`` expression did.
    return float(np.dot(((N1 * N2) / N) * diff, diff))


def avg(data, k, assignments):
    """Return the mean vector of the samples labelled ``k``.

    Returns:
        A NumPy array with the per-feature mean, or NaN when no sample
        carries label ``k``.
    """
    members = [point for point, label in zip(data, assignments) if label == k]
    if not members:
        return np.float64("nan")
    return np.sum(members, 0) / len(members)


def length(k, assignments):
    """Return how many entries of ``assignments`` equal ``k``."""
    answer = 0
    for label in assignments:
        if label == k:
            answer += 1
    return answer


def _split_group(data, assignments, source, target, tag):
    """Greedily move samples from group ``source`` to group ``target``.

    Each pass tries every sample of ``source`` in turn and keeps the single
    move with the largest criterion ``E``, provided it beats the best value
    seen so far in this split. The split ends when no move improves it.
    ``assignments`` is modified in place.
    """
    total = len(assignments)
    best = float('-inf')
    while True:
        place = None
        for i in range(total):
            if assignments[i] != source:
                continue
            assignments[i] = target  # tentative move
            n_source = length(source, assignments)
            # A move that empties the source group has no defined mean and
            # can never be selected, so it is skipped.
            if n_source > 0:
                score = E(
                    total,
                    n_source,
                    length(target, assignments),
                    avg(data, source, assignments),
                    avg(data, target, assignments),
                )
                if score > best:
                    place = i
                    best = score
                    print('max_%d:' % tag, best)
            assignments[i] = source  # undo the tentative move
        if place is None:
            break
        assignments[place] = target


def decomposition_clustering(data, assignments):
    """Split the samples into groups 0, 1 and 2 by two successive divisions.

    The first division moves samples from group 0 to group 1; the second
    always divides group 1 further, moving samples to group 2.

    Args:
        data: The samples.
        assignments: Initial labels, parallel to ``data``; the driver passes
            all zeros. Modified in place.

    Returns:
        The same ``assignments`` list, now holding labels 0, 1 or 2.
    """
    _split_group(data, assignments, 0, 1, 1)
    _split_group(data, assignments, 1, 2, 2)
    return assignments


def acc(result):
    """Count samples whose cluster index equals their positional class.

    Sample ``i`` is expected to carry label ``i // CLASS_SIZE``; only the
    first ``N_CLUSTERS * CLASS_SIZE`` entries are inspected.

    Returns:
        A ``(correct, inspected)`` tuple; the same numbers are printed.
    """
    correct = 0
    inspected = 0
    for index, entry in enumerate(result[:N_CLUSTERS * CLASS_SIZE]):
        if entry[0] == index // CLASS_SIZE:
            correct += 1
        inspected += 1
    print('sum:', correct, 'all:', inspected)
    return correct, inspected


def main():
    """Cluster the data file, list each group's members, print accuracy."""
    data = loadIRISdata(dataname)
    assignments = [0] * len(data)
    answer = decomposition_clustering(data, assignments)
    result = list(zip(answer, data))
    for cluster in range(N_CLUSTERS):
        printed = 0
        print('\n')
        print("第%d類資料有:" % (cluster + 1))
        for index, entry in enumerate(result):
            if entry[0] == cluster:
                print(index, end=' ')
                printed += 1
                # Break the listing into chunks of 21 indices.
                if printed > 20:
                    print('\n')
                    printed = 0
    print('\n')
    correct, inspected = acc(result)
    # '%.2f' (two decimals) replaces the original '%2f' typo.
    print('分解聚類法準確度為:%.2f%%' % ((correct / inspected) * 100))


if __name__ == "__main__":
    main()
分解聚類結果: