1. 程式人生 > >聚類 K-means & K-medoids 演算法

聚類 K-means & K-medoids 演算法

關於K-means和K-medoids的描述,參見pluskid部落格http://blog.pluskid.org/?tag=clustering或http://blog.csdn.net/abcjennifer/article/details/8197072

下面首先給出K-means的MATLAB程式碼:

function [labels,Cnt] = kmeans(k,D,threshold)
%KMEANS Cluster the rows of D into k groups with Lloyd's k-means iteration.
%   [labels,Cnt] = KMEANS(k,D) treats every row of D as one sample and
%   every column as one feature. k distinct rows are drawn at random as
%   the initial centres, then the assignment/update steps are repeated
%   until the centres move by less than the default threshold (1e-10).
%
%   [labels,Cnt] = KMEANS(k,D,threshold) uses the given convergence
%   threshold, compared against norm(oldCentres - newCentres).
%
%   Inputs:
%     k         - number of clusters; must not exceed the number of rows of D
%                 (randperm raises an error otherwise).
%     D         - N-by-d data matrix, one sample per row.
%     threshold - (optional) convergence threshold, default 1e-10.
%
%   Outputs:
%     labels - N-by-1 vector; labels(l) is the index of the centre nearest
%              to D(l,:).
%     Cnt    - k-by-d matrix of cluster centres that labels refers to.
%
%   NOTE: this function has the same name as the Statistics Toolbox
%   function kmeans and will shadow it when it is earlier on the path.

    % MATLAB has no default-argument syntax in the signature; emulate it.
    if nargin < 3 || isempty(threshold)
        threshold = 1e-10;
    end

    % size(D,1) is the sample count; length(D) would return the largest
    % dimension and is wrong whenever there are fewer samples than features.
    N = size(D,1);

    % k random, distinct samples serve as the initial centres.
    R_I = randperm(N,k);
    Cnt = D(R_I,:);
    labels = zeros(N,1);

    while true
        % Assignment step: Euclidean distance from every sample to every
        % centre; min returns the first minimum, so ties go to the
        % lowest-numbered centre.
        dist = zeros(N,k);
        for i = 1:k
            delta = bsxfun(@minus, D, Cnt(i,:));
            dist(:,i) = sqrt(sum(delta.^2, 2));
        end
        [~,labels] = min(dist, [], 2);

        % Update step: each centre becomes the mean of its members.
        % A cluster with no members keeps its previous centre; dividing by
        % a zero count would give NaN and the loop would never terminate.
        newCnt = Cnt;
        for i = 1:k
            members = (labels == i);
            if any(members)
                newCnt(i,:) = mean(D(members,:), 1);
            end
        end

        % Stop once the centres have (numerically) stopped moving.
        if norm(Cnt - newCnt) < threshold
            break;
        end
        Cnt = newCnt;
    end
end
實驗資料由三個二維高斯分佈生成,每個分佈取300個樣本:
% Generate the experiment data: 300 samples from each of three 2-D
% Gaussian distributions, plot them, and stack them into the data matrix D
% (one sample per row). mvnrnd is provided by the Statistics Toolbox.

% First component: mean (0,-15), isotropic covariance 45*I.
mu=[0,-15];
sigma=[45,0;0,45];
r1=mvnrnd(mu,sigma,300);

% Second component: mean (5,15), isotropic covariance 15*I.
mu=[5,15];
sigma=[15,0;0,15];
r2=mvnrnd(mu,sigma,300);

% Third component: mean (-5,7), isotropic covariance 15*I.
mu=[-5,7];
sigma=[15,0;0,15];
r3=mvnrnd(mu,sigma,300);

% Show the three components in different colours.
figure;
plot(r1(:,1),r1(:,2),'r*',r2(:,1),r2(:,2),'b*',r3(:,1),r3(:,2),'g*');
title('the generating data');

% Terminating semicolon keeps the 900-by-2 matrix from being echoed to the
% command window.
D=[r1;r2;r3];

K-medoids演算法要求每個中心(centre)必須是已有的資料點,這樣可以提高對離群點的魯棒性;因此需要計算每一類中每個點到同類其他各點的距離之和,並選取距離總和最小的點作為該類新的中心:

function [labels,Cnt] = kmedoids(k,D,threshold)
%KMEDOIDS Cluster the rows of D into k groups using k-medoids.
%   [labels,Cnt] = KMEDOIDS(k,D) treats every row of D as one sample.
%   k distinct rows are drawn at random as the initial medoids. Each
%   iteration assigns every sample to its nearest medoid and then replaces
%   each medoid by the cluster member whose summed Euclidean distance to
%   the other members of the same cluster is smallest. Iteration stops when
%   the medoids move by less than the default threshold (1e-10).
%
%   [labels,Cnt] = KMEDOIDS(k,D,threshold) uses the given convergence
%   threshold, compared against norm(oldMedoids - newMedoids).
%
%   Inputs:
%     k         - number of clusters; must not exceed the number of rows of D
%                 (randperm raises an error otherwise).
%     D         - N-by-d data matrix, one sample per row.
%     threshold - (optional) convergence threshold, default 1e-10.
%
%   Outputs:
%     labels - N-by-1 vector; labels(l) is the index of the medoid nearest
%              to D(l,:).
%     Cnt    - k-by-d matrix of medoids; every row is a row of D.

    % Optional third argument, handled via nargin.
    if nargin < 3 || isempty(threshold)
        threshold = 1e-10;
    end

    % size(D,1) is the sample count; length(D) would return the largest
    % dimension and is wrong whenever there are fewer samples than features.
    N = size(D,1);

    % k random, distinct samples serve as the initial medoids.
    R_I = randperm(N,k);
    Cnt = D(R_I,:);
    labels = zeros(N,1);

    while true
        % Assignment step: Euclidean distance from every sample to every
        % medoid; min returns the first minimum, so ties go to the
        % lowest-numbered medoid.
        dist = zeros(N,k);
        for i = 1:k
            delta = bsxfun(@minus, D, Cnt(i,:));
            dist(:,i) = sqrt(sum(delta.^2, 2));
        end
        [~,labels] = min(dist, [], 2);

        % Update step. Start from the current medoids so that a cluster
        % that received no members simply keeps its medoid.
        Cnt_ = Cnt;
        for s = 1:k
            members = find(labels == s);
            if isempty(members)
                continue;
            end
            P = D(members,:);
            m = numel(members);

            % cost(a) = sum of distances from member a to all members of
            % cluster s. Only intra-cluster distances are computed, which
            % avoids building k dense N-by-N matrices per iteration.
            cost = zeros(m,1);
            for a = 1:m
                delta = bsxfun(@minus, P, P(a,:));
                cost(a) = sum(sqrt(sum(delta.^2, 2)));
            end

            % First minimum wins, i.e. the member with the lowest row index
            % in D among equally good candidates.
            [~,best] = min(cost);
            Cnt_(s,:) = P(best,:);
        end

        % Stop once the medoids have stopped changing.
        if norm(Cnt - Cnt_) < threshold
            break;
        end
        Cnt = Cnt_;
    end
end