譜聚類演算法
阿新 • • 發佈:2019-02-11
#include<math.h>
#include<string.h>
#include"matrix.h"
#include"svd.h"
#define N 19 //樣本點個數
#define K 4 //K-Means演算法中的K
#define T 0.1 //樣本點之間相似度的閾值
double sample[N][2]; //存放所有樣本點的座標(2維的)
void readSample(char *filename){
FILE *fp;
if((fp=fopen(filename,"r"))==NULL){
perror("fopen");
exit(0);
}
char buf[50]={0};
int i=0;
while(fgets(buf,sizeof(buf),fp)!=NULL){
char *w=strtok(buf," \t");
double x=atof(w);
w=strtok(NULL," \t");
double y=atof(w);
sample[i][0]=x;
sample[i][1]=y;
i++;
memset(buf,0x00,sizeof(buf));
}
assert(i==N);
fclose(fp);
}
double** getSimMatrix(){
//為二維矩陣申請空間
double **matrix=getMatrix(N,N);
//計算樣本點兩兩之間的相似度,得到矩陣W
int i,j;
for(i=0;i<N;i++){
matrix[i][i]=1;
for(j=i+1;j<N;j++){
double dist=sqrt(pow(sample[i][0]-sample[j][0],2)+pow(sample[i][1]-sample[j][1],2));
double sim=1.0/(1+dist);
if(sim>T){
matrix[j][i]=sim;
matrix[i][j]=sim;
}
}
}
//計算L=D-W
for(j=0;j<N;j++){
double sum=0;
for(i=0;i<N;i++){
sum+=matrix[i][j];
if(i!=j)
matrix[i][j]=0-matrix[i][j];
}
matrix[j][j]=matrix[j][j]-sum;
}
return matrix;
}
int main(){
char *file="/home/orisun/data";
readSample(file);
double **L=getSimMatrix();
printMatrix(L,N,N);
double **M=singleVector(L,N,N,5);
printMatrix(M,N,5);
freeMatrix(L,N);
return 0;
}
L已是對稱矩陣,直接奇異值分解的得到的就是特徵向量
#ifndef _MATRIX_H
#define _MATRIX_H
#include<assert.h>
#include<stdlib.h>
#include<stdio.h>
//初始化一個二維矩陣
double** getMatrix(int rows,int columns){
double **rect=(double**)calloc(rows,sizeof(double*));
int i;
for(i=0;i<rows;++i)
rect[i]=(double*)calloc(columns,sizeof(double));
return rect;
}
//返回一個單位矩陣
double** getIndentityMatrix(int rows){
double** IM=getMatrix(rows,rows);
int i;
for(i=0;i<rows;++i)
IM[i][i]=1.0;
return IM;
}
//返回一個矩陣的副本
double** copyMatrix(double** matrix,int rows,int columns){
double** rect=getMatrix(rows,columns);
int i,j;
for(i=0;i<rows;++i)
for(j=0;j<columns;++j)
rect[i][j]=matrix[i][j];
return rect;
}
//從一個一維矩陣得到一個二維矩陣
void getFromArray(double** matrix,int rows,int columns,double *arr){
int i,j,k=0;
for(i=0;i<rows;++i){
for(j=0;j<columns;++j){
matrix[i][j]=arr[k++];
}
}
}
//列印二維矩陣
void printMatrix(double** matrix,int rows,int columns){
int i,j;
for(i=0;i<rows;++i){
for(j=0;j<columns;++j){
printf("%-10f\t",matrix[i][j]);
}
printf("\n");
}
}
//釋放二維矩陣
void freeMatrix(double** matrix,int rows){
int i;
for(i=0;i<rows;++i)
free(matrix[i]);
free(matrix);
}
//獲取二維矩陣的某一行
double* getRow(double **matrix,int rows,int columns,int index){
assert(index<rows);
double *rect=(double*)calloc(columns,sizeof(double));
int i;
for(i=0;i<columns;++i)
rect[i]=matrix[index][i];
return rect;
}
//獲取二維矩陣的某一列
double* getColumn(double **matrix,int rows,int columns,int index){
assert(index<columns);
double *rect=(double*)calloc(rows,sizeof(double));
int i;
for(i=0;i<rows;++i)
rect[i]=matrix[i][index];
return rect;
}
//設定二維矩陣的某一列
void setColumn(double **matrix,int rows,int columns,int index,double *arr){
assert(index<columns);
int i;
for(i=0;i<rows;++i)
matrix[i][index]=arr[i];
}
//交換矩陣的某兩列
void exchangeColumn(double **matrix,int rows,int columns,int i,int j){
assert(i<columns);
assert(j<columns);
int row;
for(row=0;row<rows;++row){
double tmp=matrix[row][i];
matrix[row][i]=matrix[row][j];
matrix[row][j]=tmp;
}
}
//得到矩陣的轉置
double** getTranspose(double **matrix,int rows,int columns){
double **rect=getMatrix(columns,rows);
int i,j;
for(i=0;i<columns;++i){
for(j=0;j<rows;++j){
rect[i][j]=matrix[j][i];
}
}
return rect;
}
//計算兩向量內積
double vectorProduct(double *vector1,double *vector2,int len){
double rect=0.0;
int i;
for(i=0;i<len;++i)
rect+=vector1[i]*vector2[i];
return rect;
}
//兩個矩陣相乘
double** matrixProduct(double **matrix1,int rows1,int columns1,double **matrix2,int columns2){
double **rect=getMatrix(rows1,columns2);
int i,j;
for(i=0;i<rows1;++i){
for(j=0;j<columns2;++j){
double *vec1=getRow(matrix1,rows1,columns1,i);
double *vec2=getColumn(matrix2,columns1,columns2,j);
rect[i][j]=vectorProduct(vec1,vec2,columns1);
free(vec1);
free(vec2);
}
}
return rect;
}
//得到某一列元素的平方和
double getColumnNorm(double** matrix,int rows,int columns,int index){
assert(index<columns);
double* vector=getColumn(matrix,rows,columns,index);
double norm=vectorProduct(vector,vector,rows);
free(vector);
return norm;
}
//列印向量
void printVector(double* vector,int len){
int i;
for(i=0;i<len;++i)
printf("%-15.8f\t",vector[i]);
printf("\n");
}
#endif
#include"matrix.h"
#define ITERATION 100 //單邊Jacobi最大迭代次數
#define THREASHOLD 0.1
//符號函式
int sign(double number) {
if(number<0)
return -1;
else
return 1;
}
//兩個向量進行單邊Jacobi正交變換
void orthogonalVector(double *Ci,double *Cj,int len1,double *Vi,double *Vj,int len2,int *pass){
double ele=vectorProduct(Ci,Cj,len1);
if(fabs(ele)<THREASHOLD)
return; //如果兩列已經正交,不需要進行變換,則返回true
*pass=0;
double ele1=vectorProduct(Ci,Ci,len1);
double ele2=vectorProduct(Cj,Cj,len1);
double tao=(ele1-ele2)/(2*ele);
double tan=sign(tao)/(fabs(tao)+sqrt(1+pow(tao,2)));
double cos=1/sqrt(1+pow(tan,2));
double sin=cos*tan;
int row;
for(row=0;row<len1;++row){
double var1=Ci[row]*cos+Cj[row]*sin;
double var2=Cj[row]*cos-Ci[row]*sin;
Ci[row]=var1;
Cj[row]=var2;
}
for(row=0;row<len2;++row){
double var1=Vi[row]*cos+Vj[row]*sin;
double var2=Vj[row]*cos-Vi[row]*sin;
Vi[row]=var1;
Vj[row]=var2;
}
}
//矩陣的兩列進行單邊Jacobi正交變換。V是方陣,行/列數為columns
void orthogonal(double **matrix,int rows,int columns,int i,int j,int *pass,double **V){
assert(i<j);
double* Ci=getColumn(matrix,rows,columns,i);
double* Cj=getColumn(matrix,rows,columns,j);
double* Vi=getColumn(V,columns,columns,i);
double* Vj=getColumn(V,columns,columns,j);
orthogonalVector(Ci,Cj,rows,Vi,Vj,columns,pass);
int row;
for(row=0;row<rows;++row){
matrix[row][i]=Ci[row];
matrix[row][j]=Cj[row];
}
for(row=0;row<columns;++row){
V[row][i]=Vi[row];
V[row][j]=Vj[row];
}
free(Ci);
free(Cj);
free(Vi);
free(Vj);
}
//迴圈正交,進行奇異值分解
void hestens_jacobi(double **matrix,int rows,int columns,double **V)
{
int iteration = ITERATION;
while (iteration-- > 0) {
int pass = 1;
int i,j;
for (i = 0; i < columns; ++i) {
for (j = i+1; j < columns; ++j) {
orthogonal(matrix,rows,columns,i,j,&pass,V); //經過多次的迭代正交後,V就求出來了
}
}
if (pass==1) //當任意兩列都正交時退出迭代
break;
}
printf("迭代次數:%d\n",ITERATION - iteration);
}
//獲取矩陣前n小的奇異向量
double **singleVector(double **A,int rows,int columns,int n){
double **V=getIndentityMatrix(columns);
hestens_jacobi(A,rows,columns,V);
double *singular=(double*)calloc(columns,sizeof(double)); //特徵值
int i,j;
for(i=0;i<columns;++i){
double *vector=getColumn(A,rows,columns,i);
double norm=sqrt(vectorProduct(vector,vector,rows));
singular[i]=norm;
}
int *sort=(int*)calloc(columns,sizeof(int));
for(i=0;i<columns;++i)
sort[i]=i;
for(i=0;i<columns-1;++i){
int minIndex=i;
int minValue=singular[i];
for(j=i+1;j<columns;++j){
if(singular[j]<minValue){
minValue=singular[j];
minIndex=j;
}
}
//交換sigular的第i個和第minIndex個元素
singular[minIndex]=singular[i];
singular[i]=minValue;
//交換sort的第i個和第minIndex個元素
int tmp=sort[minIndex];
sort[minIndex]=sort[i];
sort[i]=tmp;
}
double **rect=getMatrix(rows,n);
for(i=0;i<rows;++i){
for(j=0;j<n;++j){
rect[i][j]=V[i][sort[j]];
}
}
freeMatrix(V,columns);
free(sort);
free(singular);
return rect;
}
最後是執行KMeans的Java程式碼
package ai;
public class Global {
//計算兩個向量的歐氏距離
public static double calEuraDist(double[] arr1,double[] arr2,int len){
double result=0.0;
for(int i=0;i<len;i++){
result+=Math.pow(arr1[i]-arr2[i],2.0);
}
return Math.sqrt(result);
}
}
package ai;
public class DataObject {
String docname;
double[] vector;
int cid;
boolean visited;
public DataObject(int len){
vector=new double[len];
}
public String getName() {
return docname;
}
public void setName(String docname) {
this.docname = docname;
}
public double[] getVector() {
return vector;
}
public void setVector(double[] vector) {
this.vector = vector;
}
public int getCid() {
return cid;
}
public void setCid(int cid) {
this.cid = cid;
}
public boolean isVisited() {
return visited;
}
public void setVisited(boolean visited) {
this.visited = visited;
}
}
package ai;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
public class DataSource {
ArrayList<DataObject> objects;
int row;
int col;
public void readMatrix(File dataFile) {
try {
FileReader fr = new FileReader(dataFile);
BufferedReader br = new BufferedReader(fr);
String line = br.readLine();
String[] words = line.split("\\s+");
row = Integer.parseInt(words[0]);
// row=1000;
col = Integer.parseInt(words[1]);
objects = new ArrayList<DataObject>(row);
for (int i = 0; i < row; i++) {
DataObject object = new DataObject(col);
line = br.readLine();
words = line.split("\\s+");
for (int j = 0; j < col; j++) {
object.getVector()[j] = Double.parseDouble(words[j]);
}
objects.add(object);
}
br.close();
} catch (IOException e) {
e.printStackTrace();
}
}
public void readRLabel(File file) {
try {
FileReader fr = new FileReader(file);
BufferedReader br = new BufferedReader(fr);
String line = null;
for (int i = 0; i < row; i++) {
line = br.readLine();
objects.get(i).setName(line.trim());
}
} catch (IOException e) {
e.printStackTrace();
}
}
public void printResult(ArrayList<DataObject> objects, int n) {
//DBScan是從第1類開始,K-Means是從第0類開始
// for (int i =0; i <n; i++) {
for(int i=1;i<=n;i++){
System.out.println("=============屬於第"+i+"類的有:===========================");
Iterator<DataObject> iter = objects.iterator();
while (iter.hasNext()) {
DataObject object = iter.next();
int cid=object.getCid();
if(cid==i){
System.out.println(object.getName());
// switch(Integer.parseInt(object.getName())/1000){
// case 0:
// System.out.println(0);
// break;
// case 1:
// System.out.println(1);
// break;
// case 2:
// System.out.println(2);
// break;
// case 3:
// System.out.println(3);
// break;
// case 4:
// System.out.println(4);
// break;
// case 5:
// System.out.println(5);
// break;
// default:
// System.out.println("Go Out");
// break;
// }
}
}
}
}
}
package ai;
import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Random;
public class KMeans {
int k; // 指定劃分的簇數
double mu; // 迭代終止條件,當各個新質心相對於老質心偏移量小於mu時終止迭代
double[][] center; // 上一次各簇質心的位置
int repeat; // 重複執行次數
double[] crita; // 存放每次執行的滿意度
public KMeans(int k, double mu, int repeat, int len) {
this.k = k;
this.mu = mu;
this.repeat = repeat;
center = new double[k][];
for (int i = 0; i < k; i++)
center[i] = new double[len];
crita = new double[repeat];
}
// 初始化k個質心,每個質心是len維的向量,每維均在left--right之間
public void initCenter(int len, ArrayList<DataObject> objects) {
Random random = new Random(System.currentTimeMillis());
int[] count = new int[k]; // 記錄每個簇有多少個元素
Iterator<DataObject> iter = objects.iterator();
while (iter.hasNext()) {
DataObject object = iter.next();
int id = random.nextInt(10000)%k;
count[id]++;
for (int i = 0; i < len; i++)
center[id][i] += object.getVector()[i];
}
for (int i = 0; i < k; i++) {
for (int j = 0; j < len; j++) {
center[i][j] /= count[i];
}
}
}
// 把資料集中的每個點歸到離它最近的那個質心
public void classify(ArrayList<DataObject> objects) {
Iterator<DataObject> iter = objects.iterator();
while (iter.hasNext()) {
DataObject object = iter.next();
double[] vector = object.getVector();
int len = vector.length;
int index = 0;
double neardist = Double.MAX_VALUE;
for (int i = 0; i < k; i++) {
double dist = Global.calEuraDist(vector, center[i], len); // 使用歐氏距離
if (dist < neardist) {
neardist = dist;
index = i;
}
}
object.setCid(index);
}
}
// 重新計算每個簇的質心,並判斷終止條件是否滿足,如果不滿足更新各簇的質心,如果滿足就返回true.len是資料的維數
public boolean calNewCenter(ArrayList<DataObject> objects, int len) {
boolean end = true;
int[] count = new int[k]; // 記錄每個簇有多少個元素
double[][] sum = new double[k][];
for (int i = 0; i < k; i++)
sum[i] = new double[len];
Iterator<DataObject> iter = objects.iterator();
while (iter.hasNext()) {
DataObject object = iter.next();
int id = object.getCid();
count[id]++;
for (int i = 0; i < len; i++)
sum[id][i] += object.getVector()[i];
}
for (int i = 0; i < k; i++) {
if (count[i] != 0) {
for (int j = 0; j < len; j++) {
sum[i][j] /= count[i];
}
}
// 簇中不包含任何點,及時調整質心
else {
int a=(i+1)%k;
int b=(i+3)%k;
int c=(i+5)%k;
for (int j = 0; j < len; j++) {
center[i][j] = (center[a][j]+center[b][j]+center[c][j])/3;
}
}
}
for (int i = 0; i < k; i++) {
// 只要有一個質心需要移動的距離超過了mu,就返回false
if (Global.calEuraDist(sum[i], center[i], len) >= mu) {
end = false;
break;
}
}
if (!end) {
for (int i = 0; i < k; i++) {
for (int j = 0; j < len; j++)
center[i][j] = sum[i][j];
}
}
return end;
}
// 計算各簇內資料和方差的加權平均,得出本次聚類的滿意度.len是資料的維數
public double getSati(ArrayList<DataObject> objects, int len) {
double satisfy = 0.0;
int[] count = new int[k];
double[] ss = new double[k];
Iterator<DataObject> iter = objects.iterator();
while (iter.hasNext()) {
DataObject object = iter.next();
int id = object.getCid();
count[id]++;
for (int i = 0; i < len; i++)
ss[id] += Math.pow(object.getVector()[i] - center[id][i], 2.0);
}
for (int i = 0; i < k; i++) {
satisfy += count[i] * ss[i];
}
return satisfy;
}
public double run(int round, DataSource datasource, int len) {
System.out.println("第" + round + "次執行");
initCenter(len,datasource.objects);
classify(datasource.objects);
while (!calNewCenter(datasource.objects, len)) {
classify(datasource.objects);
}
datasource.printResult(datasource.objects, k);
double ss = getSati(datasource.objects, len);
System.out.println("加權方差:" + ss);
return ss;
}
public static void main(String[] args) {
DataSource datasource = new DataSource();
datasource.readMatrix(new File("/home/orisun/test/dot.mat"));
datasource.readRLabel(new File("/home/orisun/test/dot.rlabel"));
int len = datasource.col;
// 劃分為4個簇,質心移動小於1E-8時終止迭代,重複執行7次
KMeans km = new KMeans(4, 1E-10, 7, len);
int index = 0;
double minsa = Double.MAX_VALUE;
for (int i = 0; i < km.repeat; i++) {
double ss = km.run(i, datasource, len);
if (ss < minsa) {
minsa = ss;
index = i;
}
}
System.out.println("最好的結果是第" + index + "次。");
}
}
#include<string.h>
#include"matrix.h"
#include"svd.h"
#define N 19 //樣本點個數
#define K 4 //K-Means演算法中的K
#define T 0.1 //樣本點之間相似度的閾值
double sample[N][2]; //存放所有樣本點的座標(2維的)
void readSample(char *filename){
FILE *fp;
if((fp=fopen(filename,"r"))==NULL){
perror("fopen");
exit(0);
}
char buf[50]={0};
int i=0;
while(fgets(buf,sizeof(buf),fp)!=NULL){
char *w=strtok(buf," \t");
double x=atof(w);
w=strtok(NULL," \t");
double y=atof(w);
sample[i][0]=x;
sample[i][1]=y;
i++;
memset(buf,0x00,sizeof(buf));
}
assert(i==N);
fclose(fp);
}
double** getSimMatrix(){
//為二維矩陣申請空間
double **matrix=getMatrix(N,N);
//計算樣本點兩兩之間的相似度,得到矩陣W
int i,j;
for(i=0;i<N;i++){
matrix[i][i]=1;
for(j=i+1;j<N;j++){
double dist=sqrt(pow(sample[i][0]-sample[j][0],2)+pow(sample[i][1]-sample[j][1],2));
double sim=1.0/(1+dist);
if(sim>T){
matrix[j][i]=sim;
matrix[i][j]=sim;
}
}
}
//計算L=D-W
for(j=0;j<N;j++){
double sum=0;
for(i=0;i<N;i++){
sum+=matrix[i][j];
if(i!=j)
matrix[i][j]=0-matrix[i][j];
}
matrix[j][j]=matrix[j][j]-sum;
}
return matrix;
}
int main(){
char *file="/home/orisun/data";
readSample(file);
double **L=getSimMatrix();
printMatrix(L,N,N);
double **M=singleVector(L,N,N,5);
printMatrix(M,N,5);
freeMatrix(L,N);
return 0;
}
L已是對稱矩陣,直接奇異值分解的得到的就是特徵向量
#ifndef _MATRIX_H
#define _MATRIX_H
#include<assert.h>
#include<stdlib.h>
#include<stdio.h>
//初始化一個二維矩陣
double** getMatrix(int rows,int columns){
double **rect=(double**)calloc(rows,sizeof(double*));
int i;
for(i=0;i<rows;++i)
rect[i]=(double*)calloc(columns,sizeof(double));
return rect;
}
//返回一個單位矩陣
double** getIndentityMatrix(int rows){
double** IM=getMatrix(rows,rows);
int i;
for(i=0;i<rows;++i)
IM[i][i]=1.0;
return IM;
}
//返回一個矩陣的副本
double** copyMatrix(double** matrix,int rows,int columns){
double** rect=getMatrix(rows,columns);
int i,j;
for(i=0;i<rows;++i)
for(j=0;j<columns;++j)
rect[i][j]=matrix[i][j];
return rect;
}
//從一個一維矩陣得到一個二維矩陣
void getFromArray(double** matrix,int rows,int columns,double *arr){
int i,j,k=0;
for(i=0;i<rows;++i){
for(j=0;j<columns;++j){
matrix[i][j]=arr[k++];
}
}
}
//列印二維矩陣
void printMatrix(double** matrix,int rows,int columns){
int i,j;
for(i=0;i<rows;++i){
for(j=0;j<columns;++j){
printf("%-10f\t",matrix[i][j]);
}
printf("\n");
}
}
//釋放二維矩陣
void freeMatrix(double** matrix,int rows){
int i;
for(i=0;i<rows;++i)
free(matrix[i]);
free(matrix);
}
//獲取二維矩陣的某一行
double* getRow(double **matrix,int rows,int columns,int index){
assert(index<rows);
double *rect=(double*)calloc(columns,sizeof(double));
int i;
for(i=0;i<columns;++i)
rect[i]=matrix[index][i];
return rect;
}
//獲取二維矩陣的某一列
double* getColumn(double **matrix,int rows,int columns,int index){
assert(index<columns);
double *rect=(double*)calloc(rows,sizeof(double));
int i;
for(i=0;i<rows;++i)
rect[i]=matrix[i][index];
return rect;
}
//設定二維矩陣的某一列
void setColumn(double **matrix,int rows,int columns,int index,double *arr){
assert(index<columns);
int i;
for(i=0;i<rows;++i)
matrix[i][index]=arr[i];
}
//交換矩陣的某兩列
void exchangeColumn(double **matrix,int rows,int columns,int i,int j){
assert(i<columns);
assert(j<columns);
int row;
for(row=0;row<rows;++row){
double tmp=matrix[row][i];
matrix[row][i]=matrix[row][j];
matrix[row][j]=tmp;
}
}
//得到矩陣的轉置
double** getTranspose(double **matrix,int rows,int columns){
double **rect=getMatrix(columns,rows);
int i,j;
for(i=0;i<columns;++i){
for(j=0;j<rows;++j){
rect[i][j]=matrix[j][i];
}
}
return rect;
}
//計算兩向量內積
double vectorProduct(double *vector1,double *vector2,int len){
double rect=0.0;
int i;
for(i=0;i<len;++i)
rect+=vector1[i]*vector2[i];
return rect;
}
//兩個矩陣相乘
double** matrixProduct(double **matrix1,int rows1,int columns1,double **matrix2,int columns2){
double **rect=getMatrix(rows1,columns2);
int i,j;
for(i=0;i<rows1;++i){
for(j=0;j<columns2;++j){
double *vec1=getRow(matrix1,rows1,columns1,i);
double *vec2=getColumn(matrix2,columns1,columns2,j);
rect[i][j]=vectorProduct(vec1,vec2,columns1);
free(vec1);
free(vec2);
}
}
return rect;
}
//得到某一列元素的平方和
double getColumnNorm(double** matrix,int rows,int columns,int index){
assert(index<columns);
double* vector=getColumn(matrix,rows,columns,index);
double norm=vectorProduct(vector,vector,rows);
free(vector);
return norm;
}
//列印向量
void printVector(double* vector,int len){
int i;
for(i=0;i<len;++i)
printf("%-15.8f\t",vector[i]);
printf("\n");
}
#endif
#include"matrix.h"
#define ITERATION 100 //單邊Jacobi最大迭代次數
#define THREASHOLD 0.1
//符號函式
int sign(double number) {
if(number<0)
return -1;
else
return 1;
}
//兩個向量進行單邊Jacobi正交變換
void orthogonalVector(double *Ci,double *Cj,int len1,double *Vi,double *Vj,int len2,int *pass){
double ele=vectorProduct(Ci,Cj,len1);
if(fabs(ele)<THREASHOLD)
return; //如果兩列已經正交,不需要進行變換,則返回true
*pass=0;
double ele1=vectorProduct(Ci,Ci,len1);
double ele2=vectorProduct(Cj,Cj,len1);
double tao=(ele1-ele2)/(2*ele);
double tan=sign(tao)/(fabs(tao)+sqrt(1+pow(tao,2)));
double cos=1/sqrt(1+pow(tan,2));
double sin=cos*tan;
int row;
for(row=0;row<len1;++row){
double var1=Ci[row]*cos+Cj[row]*sin;
double var2=Cj[row]*cos-Ci[row]*sin;
Ci[row]=var1;
Cj[row]=var2;
}
for(row=0;row<len2;++row){
double var1=Vi[row]*cos+Vj[row]*sin;
double var2=Vj[row]*cos-Vi[row]*sin;
Vi[row]=var1;
Vj[row]=var2;
}
}
//矩陣的兩列進行單邊Jacobi正交變換。V是方陣,行/列數為columns
void orthogonal(double **matrix,int rows,int columns,int i,int j,int *pass,double **V){
assert(i<j);
double* Ci=getColumn(matrix,rows,columns,i);
double* Cj=getColumn(matrix,rows,columns,j);
double* Vi=getColumn(V,columns,columns,i);
double* Vj=getColumn(V,columns,columns,j);
orthogonalVector(Ci,Cj,rows,Vi,Vj,columns,pass);
int row;
for(row=0;row<rows;++row){
matrix[row][i]=Ci[row];
matrix[row][j]=Cj[row];
}
for(row=0;row<columns;++row){
V[row][i]=Vi[row];
V[row][j]=Vj[row];
}
free(Ci);
free(Cj);
free(Vi);
free(Vj);
}
//迴圈正交,進行奇異值分解
void hestens_jacobi(double **matrix,int rows,int columns,double **V)
{
int iteration = ITERATION;
while (iteration-- > 0) {
int pass = 1;
int i,j;
for (i = 0; i < columns; ++i) {
for (j = i+1; j < columns; ++j) {
orthogonal(matrix,rows,columns,i,j,&pass,V); //經過多次的迭代正交後,V就求出來了
}
}
if (pass==1) //當任意兩列都正交時退出迭代
break;
}
printf("迭代次數:%d\n",ITERATION - iteration);
}
//獲取矩陣前n小的奇異向量
double **singleVector(double **A,int rows,int columns,int n){
double **V=getIndentityMatrix(columns);
hestens_jacobi(A,rows,columns,V);
double *singular=(double*)calloc(columns,sizeof(double)); //特徵值
int i,j;
for(i=0;i<columns;++i){
double *vector=getColumn(A,rows,columns,i);
double norm=sqrt(vectorProduct(vector,vector,rows));
singular[i]=norm;
}
int *sort=(int*)calloc(columns,sizeof(int));
for(i=0;i<columns;++i)
sort[i]=i;
for(i=0;i<columns-1;++i){
int minIndex=i;
int minValue=singular[i];
for(j=i+1;j<columns;++j){
if(singular[j]<minValue){
minValue=singular[j];
minIndex=j;
}
}
//交換sigular的第i個和第minIndex個元素
singular[minIndex]=singular[i];
singular[i]=minValue;
//交換sort的第i個和第minIndex個元素
int tmp=sort[minIndex];
sort[minIndex]=sort[i];
sort[i]=tmp;
}
double **rect=getMatrix(rows,n);
for(i=0;i<rows;++i){
for(j=0;j<n;++j){
rect[i][j]=V[i][sort[j]];
}
}
freeMatrix(V,columns);
free(sort);
free(singular);
return rect;
}
最後是執行KMeans的Java程式碼
package ai;
public class Global {
//計算兩個向量的歐氏距離
public static double calEuraDist(double[] arr1,double[] arr2,int len){
double result=0.0;
for(int i=0;i<len;i++){
result+=Math.pow(arr1[i]-arr2[i],2.0);
}
return Math.sqrt(result);
}
}
package ai;
public class DataObject {
String docname;
double[] vector;
int cid;
boolean visited;
public DataObject(int len){
vector=new double[len];
}
public String getName() {
return docname;
}
public void setName(String docname) {
this.docname = docname;
}
public double[] getVector() {
return vector;
}
public void setVector(double[] vector) {
this.vector = vector;
}
public int getCid() {
return cid;
}
public void setCid(int cid) {
this.cid = cid;
}
public boolean isVisited() {
return visited;
}
public void setVisited(boolean visited) {
this.visited = visited;
}
}
package ai;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
public class DataSource {
ArrayList<DataObject> objects;
int row;
int col;
public void readMatrix(File dataFile) {
try {
FileReader fr = new FileReader(dataFile);
BufferedReader br = new BufferedReader(fr);
String line = br.readLine();
String[] words = line.split("\\s+");
row = Integer.parseInt(words[0]);
// row=1000;
col = Integer.parseInt(words[1]);
objects = new ArrayList<DataObject>(row);
for (int i = 0; i < row; i++) {
DataObject object = new DataObject(col);
line = br.readLine();
words = line.split("\\s+");
for (int j = 0; j < col; j++) {
object.getVector()[j] = Double.parseDouble(words[j]);
}
objects.add(object);
}
br.close();
} catch (IOException e) {
e.printStackTrace();
}
}
public void readRLabel(File file) {
try {
FileReader fr = new FileReader(file);
BufferedReader br = new BufferedReader(fr);
String line = null;
for (int i = 0; i < row; i++) {
line = br.readLine();
objects.get(i).setName(line.trim());
}
} catch (IOException e) {
e.printStackTrace();
}
}
public void printResult(ArrayList<DataObject> objects, int n) {
//DBScan是從第1類開始,K-Means是從第0類開始
// for (int i =0; i <n; i++) {
for(int i=1;i<=n;i++){
System.out.println("=============屬於第"+i+"類的有:===========================");
Iterator<DataObject> iter = objects.iterator();
while (iter.hasNext()) {
DataObject object = iter.next();
int cid=object.getCid();
if(cid==i){
System.out.println(object.getName());
// switch(Integer.parseInt(object.getName())/1000){
// case 0:
// System.out.println(0);
// break;
// case 1:
// System.out.println(1);
// break;
// case 2:
// System.out.println(2);
// break;
// case 3:
// System.out.println(3);
// break;
// case 4:
// System.out.println(4);
// break;
// case 5:
// System.out.println(5);
// break;
// default:
// System.out.println("Go Out");
// break;
// }
}
}
}
}
}
package ai;
import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Random;
public class KMeans {
int k; // 指定劃分的簇數
double mu; // 迭代終止條件,當各個新質心相對於老質心偏移量小於mu時終止迭代
double[][] center; // 上一次各簇質心的位置
int repeat; // 重複執行次數
double[] crita; // 存放每次執行的滿意度
public KMeans(int k, double mu, int repeat, int len) {
this.k = k;
this.mu = mu;
this.repeat = repeat;
center = new double[k][];
for (int i = 0; i < k; i++)
center[i] = new double[len];
crita = new double[repeat];
}
// 初始化k個質心,每個質心是len維的向量,每維均在left--right之間
public void initCenter(int len, ArrayList<DataObject> objects) {
Random random = new Random(System.currentTimeMillis());
int[] count = new int[k]; // 記錄每個簇有多少個元素
Iterator<DataObject> iter = objects.iterator();
while (iter.hasNext()) {
DataObject object = iter.next();
int id = random.nextInt(10000)%k;
count[id]++;
for (int i = 0; i < len; i++)
center[id][i] += object.getVector()[i];
}
for (int i = 0; i < k; i++) {
for (int j = 0; j < len; j++) {
center[i][j] /= count[i];
}
}
}
// 把資料集中的每個點歸到離它最近的那個質心
public void classify(ArrayList<DataObject> objects) {
Iterator<DataObject> iter = objects.iterator();
while (iter.hasNext()) {
DataObject object = iter.next();
double[] vector = object.getVector();
int len = vector.length;
int index = 0;
double neardist = Double.MAX_VALUE;
for (int i = 0; i < k; i++) {
double dist = Global.calEuraDist(vector, center[i], len); // 使用歐氏距離
if (dist < neardist) {
neardist = dist;
index = i;
}
}
object.setCid(index);
}
}
// 重新計算每個簇的質心,並判斷終止條件是否滿足,如果不滿足更新各簇的質心,如果滿足就返回true.len是資料的維數
public boolean calNewCenter(ArrayList<DataObject> objects, int len) {
boolean end = true;
int[] count = new int[k]; // 記錄每個簇有多少個元素
double[][] sum = new double[k][];
for (int i = 0; i < k; i++)
sum[i] = new double[len];
Iterator<DataObject> iter = objects.iterator();
while (iter.hasNext()) {
DataObject object = iter.next();
int id = object.getCid();
count[id]++;
for (int i = 0; i < len; i++)
sum[id][i] += object.getVector()[i];
}
for (int i = 0; i < k; i++) {
if (count[i] != 0) {
for (int j = 0; j < len; j++) {
sum[i][j] /= count[i];
}
}
// 簇中不包含任何點,及時調整質心
else {
int a=(i+1)%k;
int b=(i+3)%k;
int c=(i+5)%k;
for (int j = 0; j < len; j++) {
center[i][j] = (center[a][j]+center[b][j]+center[c][j])/3;
}
}
}
for (int i = 0; i < k; i++) {
// 只要有一個質心需要移動的距離超過了mu,就返回false
if (Global.calEuraDist(sum[i], center[i], len) >= mu) {
end = false;
break;
}
}
if (!end) {
for (int i = 0; i < k; i++) {
for (int j = 0; j < len; j++)
center[i][j] = sum[i][j];
}
}
return end;
}
// 計算各簇內資料和方差的加權平均,得出本次聚類的滿意度.len是資料的維數
public double getSati(ArrayList<DataObject> objects, int len) {
double satisfy = 0.0;
int[] count = new int[k];
double[] ss = new double[k];
Iterator<DataObject> iter = objects.iterator();
while (iter.hasNext()) {
DataObject object = iter.next();
int id = object.getCid();
count[id]++;
for (int i = 0; i < len; i++)
ss[id] += Math.pow(object.getVector()[i] - center[id][i], 2.0);
}
for (int i = 0; i < k; i++) {
satisfy += count[i] * ss[i];
}
return satisfy;
}
public double run(int round, DataSource datasource, int len) {
System.out.println("第" + round + "次執行");
initCenter(len,datasource.objects);
classify(datasource.objects);
while (!calNewCenter(datasource.objects, len)) {
classify(datasource.objects);
}
datasource.printResult(datasource.objects, k);
double ss = getSati(datasource.objects, len);
System.out.println("加權方差:" + ss);
return ss;
}
public static void main(String[] args) {
DataSource datasource = new DataSource();
datasource.readMatrix(new File("/home/orisun/test/dot.mat"));
datasource.readRLabel(new File("/home/orisun/test/dot.rlabel"));
int len = datasource.col;
// 劃分為4個簇,質心移動小於1E-8時終止迭代,重複執行7次
KMeans km = new KMeans(4, 1E-10, 7, len);
int index = 0;
double minsa = Double.MAX_VALUE;
for (int i = 0; i < km.repeat; i++) {
double ss = km.run(i, datasource, len);
if (ss < minsa) {
minsa = ss;
index = i;
}
}
System.out.println("最好的結果是第" + index + "次。");
}
}