Text Clustering: K-means
The previous two articles tested classification of the newsgroup corpus with the Naive Bayes and KNN algorithms; this article uses the K-means algorithm to cluster the same documents.
1. Text Preprocessing
Text preprocessing was covered in the previous two articles and is omitted here.
2. Text Vectorization
package com.datamine.kmeans;

import java.io.*;
import java.util.*;
import java.util.Map.Entry;

/**
 * Computes the attribute vector of each document and vectorizes the whole document set.
 * @author Administrator
 */
public class ComputeWordsVector {

    /**
     * Computes the TF-IDF attribute vector of every document and returns
     * Map<file name, <feature word, TF-IDF value>>.
     * @param testSampleDir directory of the preprocessed clustering test samples
     * @return map of the attribute vectors of all test samples
     * @throws IOException
     */
    public Map<String, Map<String, Double>> computeTFMultiIDF(String testSampleDir) throws IOException {
        String word;
        Map<String, Map<String, Double>> allTestSampleMap = new TreeMap<String, Map<String, Double>>();
        Map<String, Double> idfPerWordMap = computeIDF(testSampleDir);
        Map<String, Double> tfPerDocMap = new TreeMap<String, Double>();
        File[] samples = new File(testSampleDir).listFiles();
        System.out.println("the total number of test files is " + samples.length);
        for (int i = 0; i < samples.length; i++) {
            tfPerDocMap.clear();
            FileReader samReader = new FileReader(samples[i]);
            BufferedReader samBR = new BufferedReader(samReader);
            Double wordSumPerDoc = 0.0; // total number of words in this document
            while ((word = samBR.readLine()) != null) {
                if (!word.isEmpty()) {
                    wordSumPerDoc++;
                    if (tfPerDocMap.containsKey(word))
                        tfPerDocMap.put(word, tfPerDocMap.get(word) + 1.0);
                    else
                        tfPerDocMap.put(word, 1.0);
                }
            }
            samBR.close();
            Double maxCount = 0.0, wordWeight; // count of the most frequent word, intended for normalization (currently unused)
            Set<Map.Entry<String, Double>> tempTF = tfPerDocMap.entrySet();
            for (Iterator<Map.Entry<String, Double>> mt = tempTF.iterator(); mt.hasNext();) {
                Map.Entry<String, Double> me = mt.next();
                if (me.getValue() > maxCount)
                    maxCount = me.getValue();
            }
            for (Iterator<Map.Entry<String, Double>> mt = tempTF.iterator(); mt.hasNext();) {
                Map.Entry<String, Double> me = mt.next();
                Double IDF = Math.log(samples.length / idfPerWordMap.get(me.getKey()));
                wordWeight = (me.getValue() / wordSumPerDoc) * IDF;
                tfPerDocMap.put(me.getKey(), wordWeight);
            }
            TreeMap<String, Double> tempMap = new TreeMap<String, Double>();
            tempMap.putAll(tfPerDocMap);
            allTestSampleMap.put(samples[i].getName(), tempMap);
        }
        printTestSampleMap(allTestSampleMap);
        return allTestSampleMap;
    }

    /**
     * Writes the test sample map to a file, for inspection.
     * @param allTestSampleMap
     * @throws IOException
     */
    private void printTestSampleMap(Map<String, Map<String, Double>> allTestSampleMap) throws IOException {
        File outPutFile = new File("E:/DataMiningSample/KmeansClusterResult/allTestSampleMap.txt");
        FileWriter outPutFileWriter = new FileWriter(outPutFile);
        Set<Map.Entry<String, Map<String, Double>>> allWords = allTestSampleMap.entrySet();
        for (Iterator<Entry<String, Map<String, Double>>> it = allWords.iterator(); it.hasNext();) {
            Map.Entry<String, Map<String, Double>> me = it.next();
            outPutFileWriter.append(me.getKey() + " ");
            Set<Map.Entry<String, Double>> vectorSet = me.getValue().entrySet();
            for (Iterator<Map.Entry<String, Double>> vt = vectorSet.iterator(); vt.hasNext();) {
                Map.Entry<String, Double> vme = vt.next();
                outPutFileWriter.append(vme.getKey() + " " + vme.getValue() + " ");
            }
            outPutFileWriter.append("\n");
            outPutFileWriter.flush();
        }
        outPutFileWriter.close();
    }

    /**
     * Counts the total occurrences of every word; the words occurring more than
     * n times form the final attribute dictionary.
     * @param strDir absolute path of the preprocessed newsgroup directory
     * @param wordMap attribute dictionary recording every word encountered
     * @return newWordMap the words occurring more than n times
     * @throws IOException
     */
    public SortedMap<String, Double> countWords(String strDir, Map<String, Double> wordMap) throws IOException {
        File sampleFile = new File(strDir);
        File[] sample = sampleFile.listFiles();
        String word;
        for (int i = 0; i < sample.length; i++) {
            if (!sample[i].isDirectory()) {
                FileReader samReader = new FileReader(sample[i]);
                BufferedReader samBR = new BufferedReader(samReader);
                while ((word = samBR.readLine()) != null) {
                    if (!word.isEmpty()) {
                        if (wordMap.containsKey(word))
                            wordMap.put(word, wordMap.get(word) + 1);
                        else
                            wordMap.put(word, 1.0);
                    }
                }
                samBR.close();
            } else {
                countWords(sample[i].getCanonicalPath(), wordMap);
            }
        }
        /*
         * After removing stopwords, select feature words with the DF method;
         * other feature-selection algorithms can be plugged in later.
         */
        SortedMap<String, Double> newWordMap = new TreeMap<String, Double>();
        Set<Map.Entry<String, Double>> allWords = wordMap.entrySet();
        for (Iterator<Map.Entry<String, Double>> it = allWords.iterator(); it.hasNext();) {
            Map.Entry<String, Double> me = it.next();
            if (me.getValue() > 100) // DF dimensionality reduction
                newWordMap.put(me.getKey(), me.getValue());
        }
        return newWordMap;
    }

    /**
     * Computes the IDF statistics: for every word in the attribute dictionary,
     * the number of documents it occurs in.
     * @param testSampleDir directory of the clustering test samples
     * @return IDF map <word, number of documents containing the word>
     * @throws IOException
     */
    public Map<String, Double> computeIDF(String testSampleDir) throws IOException {
        Map<String, Double> IDFPerWordMap = new TreeMap<String, Double>();
        // words already seen in the current document
        Set<String> alreadyCountWord = new HashSet<String>();
        String word;
        File[] samples = new File(testSampleDir).listFiles();
        for (int i = 0; i < samples.length; i++) {
            alreadyCountWord.clear();
            FileReader tsReader = new FileReader(samples[i]);
            BufferedReader tsBR = new BufferedReader(tsReader);
            while ((word = tsBR.readLine()) != null) {
                if (!alreadyCountWord.contains(word)) {
                    if (IDFPerWordMap.containsKey(word))
                        IDFPerWordMap.put(word, IDFPerWordMap.get(word) + 1.0);
                    else
                        IDFPerWordMap.put(word, 1.0);
                    alreadyCountWord.add(word);
                }
            }
            tsBR.close();
        }
        return IDFPerWordMap;
    }

    /**
     * Builds the test sample set for the clustering algorithm: filters each
     * document down to feature words only and writes it to the target directory.
     * @param srcDir source directory of preprocessed documents not yet filtered to feature words
     * @param desDir target directory for the clustering test samples
     * @return the feature word array of the test sample set
     * @throws IOException
     */
    public String[] createTestSamples(String srcDir, String desDir) throws IOException {
        SortedMap<String, Double> wordMap = new TreeMap<String, Double>();
        wordMap = countWords(srcDir, wordMap);
        System.out.println("special words map sizes:" + wordMap.size());
        String word, testSampleFile;
        File[] sampleDir = new File(srcDir).listFiles();
        for (int i = 0; i < sampleDir.length; i++) {
            File[] sample = sampleDir[i].listFiles();
            for (int j = 0; j < sample.length; j++) {
                testSampleFile = desDir + sampleDir[i].getName() + "_" + sample[j].getName();
                FileReader samReader = new FileReader(sample[j]);
                BufferedReader samBR = new BufferedReader(samReader);
                FileWriter tsWriter = new FileWriter(new File(testSampleFile));
                while ((word = samBR.readLine()) != null) {
                    if (wordMap.containsKey(word))
                        tsWriter.append(word + "\n");
                }
                tsWriter.flush();
                tsWriter.close();
                samBR.close();
            }
        }
        // return the attribute dictionary
        String[] terms = new String[wordMap.size()];
        int i = 0;
        Set<Map.Entry<String, Double>> allWords = wordMap.entrySet();
        for (Iterator<Map.Entry<String, Double>> it = allWords.iterator(); it.hasNext();) {
            Map.Entry<String, Double> me = it.next();
            terms[i] = me.getKey();
            i++;
        }
        return terms;
    }
}
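For reference, the weight that computeTFMultiIDF assigns to a term t in a document d is the standard TF-IDF product (written here in LaTeX; N is the number of documents, |d| the number of tokens in d, tf(t,d) the count of t in d, and df(t) the number of documents containing t, i.e. the value stored in idfPerWordMap):

w(t,d) = \frac{tf(t,d)}{|d|} \cdot \ln \frac{N}{df(t)}

The natural logarithm matches Math.log in the code above.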
3. The K-means Algorithm
K-means is a classic clustering algorithm. Its main steps: select K initial points (possibly at random) as the initial cluster centers, then compute the distance from every point to each of the K centers and assign each point to the nearest cluster. Once all points are assigned, recompute the center of each cluster; the centers move, so update them, recompute the distances from every point to the new centers, and reassign. The centers move again, and the iteration continues in this way until it converges.
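Written compactly (the textbook formulation, which the implementation below follows), each iteration alternates an assignment step and an update step over points x_i and centers \mu_j:

c_i = \arg\min_{j \in \{1,\dots,K\}} d(x_i, \mu_j) \qquad \text{(assign each point to its nearest center)}

\mu_j = \frac{1}{|C_j|} \sum_{x_i \in C_j} x_i \qquad \text{(recenter each cluster } C_j \text{ on the mean of its members)}

The iteration stops when no assignment changes or a maximum iteration count is hit.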
Strategies for picking the initial points: random selection, uniform sampling, the max-min method, and so on.
Distance measure: 1 minus the similarity, where the similarity is either the cosine similarity or the plain vector inner product (a minimal sketch of both variants follows this list).
Stopping condition: evaluate a criterion function and cap the number of iterations.
Handling empty clusters: beware of program bugs caused by clusters that lose all their members.
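As an illustration of the distance measure, here is a minimal sketch of both variants on sparse Map-based vectors. The class and method names are illustrative only; the implementation actually used in this article is computeSim in KmeansCluster below, which uses the inner-product variant.

import java.util.Map;

// Illustrative sketch only; names are hypothetical.
public class VectorDistance {

    // Inner product of two sparse vectors stored as <term, weight> maps.
    static double innerProduct(Map<String, Double> a, Map<String, Double> b) {
        double mul = 0;
        for (Map.Entry<String, Double> e : a.entrySet()) {
            Double w = b.get(e.getKey());
            if (w != null)
                mul += e.getValue() * w;
        }
        return mul;
    }

    // Euclidean norm of a sparse vector.
    static double norm(Map<String, Double> v) {
        double s = 0;
        for (double w : v.values())
            s += w * w;
        return Math.sqrt(s);
    }

    // Variant 1: distance = 1 - cosine similarity.
    static double cosineDistance(Map<String, Double> a, Map<String, Double> b) {
        return 1 - innerProduct(a, b) / (norm(a) * norm(b));
    }

    // Variant 2: distance = 1 - inner product (cheaper; the article reports comparable quality on TF-IDF vectors).
    static double innerProductDistance(Map<String, Double> a, Map<String, Double> b) {
        return 1 - innerProduct(a, b);
    }
}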
package com.datamine.kmeans;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;

/**
 * Implementation of the K-means clustering algorithm; clusters the newsgroup
 * documents into 10, 20 or 30 clusters.
 * Termination condition: the algorithm stops once every point's nearest cluster
 * center is the center of the cluster it is already assigned to.
 * @author Administrator
 */
public class KmeansCluster {

    /**
     * Main K-means procedure.
     * @param allTestSampleMap vectorized test samples <file name, <feature word, TF-IDF value>>
     * @param k number of clusters
     * @return clustering result <file name, cluster id after clustering>
     */
    private Map<String, Integer> doProcess(Map<String, Map<String, Double>> allTestSampleMap, int k) {

        // 0. Build the array of all file names, in the order of allTestSampleMap.
        String[] testSampleNames = new String[allTestSampleMap.size()];
        int count = 0, tsLength = allTestSampleMap.size();
        Set<Map.Entry<String, Map<String, Double>>> allTestSampleMapSet = allTestSampleMap.entrySet();
        for (Iterator<Map.Entry<String, Map<String, Double>>> it = allTestSampleMapSet.iterator(); it.hasNext();) {
            Map.Entry<String, Map<String, Double>> me = it.next();
            testSampleNames[count++] = me.getKey();
        }

        // 1. Choose the initial centers, either randomly or evenly spaced; the latter is used here.
        Map<Integer, Map<String, Double>> meansMap = getInitPoint(allTestSampleMap, k);
        double[][] distance = new double[tsLength][k]; // distance[i][j]: distance from point i to center j

        // 2. Initialize the k clusters.
        int[] assignMeans = new int[tsLength]; // cluster id of every point, initially all 0
        Map<Integer, Vector<Integer>> clusterMember = new TreeMap<Integer, Vector<Integer>>(); // member ids per cluster
        Vector<Integer> mem = new Vector<Integer>();
        int iterNum = 0; // iteration counter
        while (true) {
            System.out.println("Iteration No." + (iterNum++) + "-------------------------");

            // 3. Compute the distance from every point to every cluster center.
            for (int i = 0; i < tsLength; i++) {
                for (int j = 0; j < k; j++)
                    distance[i][j] = getDistance(allTestSampleMap.get(testSampleNames[i]), meansMap.get(j));
            }

            // 4. Find the nearest cluster center for every point.
            int[] nearestMeans = new int[tsLength];
            for (int i = 0; i < tsLength; i++) {
                nearestMeans[i] = findNearestMeans(distance, i);
            }

            // 5. If every point is already assigned to its nearest cluster, or the
            // maximum number of iterations has been reached, terminate.
            int okCount = 0;
            for (int i = 0; i < tsLength; i++) {
                if (nearestMeans[i] == assignMeans[i])
                    okCount++;
            }
            System.out.println("okCount = " + okCount);
            if (okCount == tsLength || iterNum >= 10)
                break;

            // 6. Otherwise another iteration is needed: update each cluster's members
            // and each point's cluster assignment.
            clusterMember.clear();
            for (int i = 0; i < tsLength; i++) {
                assignMeans[i] = nearestMeans[i];
                if (clusterMember.containsKey(nearestMeans[i])) {
                    clusterMember.get(nearestMeans[i]).add(i);
                } else {
                    mem.clear();
                    mem.add(i);
                    Vector<Integer> tempMem = new Vector<Integer>();
                    tempMem.addAll(mem);
                    clusterMember.put(nearestMeans[i], tempMem);
                }
            }

            // 7. Recompute each cluster's center.
            for (int i = 0; i < k; i++) {
                if (!clusterMember.containsKey(i)) // K-means can produce empty clusters
                    continue;
                Map<String, Double> newMean = computeNewMean(clusterMember.get(i), allTestSampleMap, testSampleNames);
                Map<String, Double> tempMean = new TreeMap<String, Double>();
                tempMean.putAll(newMean);
                meansMap.put(i, tempMean);
            }
        }

        // 8. Assemble and return the clustering result.
        Map<String, Integer> resMap = new TreeMap<String, Integer>();
        for (int i = 0; i < tsLength; i++) {
            resMap.put(testSampleNames[i], assignMeans[i]);
        }
        return resMap;
    }

    /**
     * Computes the new center of a cluster as the average of its member vectors.
     * @param clusterM member point ids of the current cluster
     * @param allTestSampleMap all test samples <file name, vector>
     * @param testSampleNames array of all test sample names
     * @return the new cluster center vector
     */
    private Map<String, Double> computeNewMean(Vector<Integer> clusterM,
            Map<String, Map<String, Double>> allTestSampleMap, String[] testSampleNames) {
        double memberNum = (double) clusterM.size();
        Map<String, Double> newMeanMap = new TreeMap<String, Double>();
        Map<String, Double> currentMemMap = new TreeMap<String, Double>();
        for (Iterator<Integer> it = clusterM.iterator(); it.hasNext();) {
            int me = it.next();
            currentMemMap = allTestSampleMap.get(testSampleNames[me]);
            Set<Map.Entry<String, Double>> currentMemMapSet = currentMemMap.entrySet();
            for (Iterator<Map.Entry<String, Double>> jt = currentMemMapSet.iterator(); jt.hasNext();) {
                Map.Entry<String, Double> ne = jt.next();
                if (newMeanMap.containsKey(ne.getKey()))
                    newMeanMap.put(ne.getKey(), newMeanMap.get(ne.getKey()) + ne.getValue());
                else
                    newMeanMap.put(ne.getKey(), ne.getValue());
            }
        }
        Set<Map.Entry<String, Double>> newMeanMapSet = newMeanMap.entrySet();
        for (Iterator<Map.Entry<String, Double>> it = newMeanMapSet.iterator(); it.hasNext();) {
            Map.Entry<String, Double> me = it.next();
            newMeanMap.put(me.getKey(), newMeanMap.get(me.getKey()) / memberNum);
        }
        return newMeanMap;
    }

    /**
     * Finds the cluster center nearest to the given point.
     * @param distance distances from every point to every cluster center
     * @param m index of the point (document number)
     * @return index j of the nearest cluster center
     */
    private int findNearestMeans(double[][] distance, int m) {
        double minDist = 10; // initial upper bound; actual distances are far smaller
        int j = 0;
        for (int i = 0; i < distance[m].length; i++) {
            if (distance[m][i] < minDist) {
                minDist = distance[m][i];
                j = i;
            }
        }
        return j;
    }

    /**
     * Computes the distance between two points as 1 minus their similarity.
     * @param map1 vector map of point 1
     * @param map2 vector map of point 2
     * @return distance between the two points
     */
    private double getDistance(Map<String, Double> map1, Map<String, Double> map2) {
        return 1 - computeSim(map1, map2);
    }

    /**
     * Computes the similarity of two documents, either as the cosine of the angle
     * between the two vectors (uncomment the commented-out code) or as the plain
     * inner product (code as-is; comparable quality and faster).
     * @param testWordTFMap <word, weight> vector of document 1
     * @param trainWordTFMap <word, weight> vector of document 2
     * @return Double similarity of the two vectors
     */
    private double computeSim(Map<String, Double> testWordTFMap, Map<String, Double> trainWordTFMap) {
        double mul = 0; //, testAbs = 0, trainAbs = 0;
        Set<Map.Entry<String, Double>> testWordTFMapSet = testWordTFMap.entrySet();
        for (Iterator<Map.Entry<String, Double>> it = testWordTFMapSet.iterator(); it.hasNext();) {
            Map.Entry<String, Double> me = it.next();
            if (trainWordTFMap.containsKey(me.getKey())) {
                mul += me.getValue() * trainWordTFMap.get(me.getKey());
            }
            //testAbs += me.getValue() * me.getValue();
        }
        //testAbs = Math.sqrt(testAbs);
        /*Set<Map.Entry<String, Double>> trainWordTFMapSet = trainWordTFMap.entrySet();
        for (Iterator<Map.Entry<String, Double>> it = trainWordTFMapSet.iterator(); it.hasNext();) {
            Map.Entry<String, Double> me = it.next();
            trainAbs += me.getValue() * me.getValue();
        }
        trainAbs = Math.sqrt(trainAbs);*/
        return mul; // / (testAbs * trainAbs);
    }

    /**
     * Picks the initial centers for the K-means iteration, evenly spaced over the samples.
     * @param allTestSampleMap <file name, <feature word, TF-IDF value>>
     * @param k number of clusters
     * @return meansMap center vectors of the k clusters
     */
    private Map<Integer, Map<String, Double>> getInitPoint(Map<String, Map<String, Double>> allTestSampleMap, int k) {
        int count = 0, i = 0;
        // center vectors of the k clusters
        Map<Integer, Map<String, Double>> meansMap = new TreeMap<Integer, Map<String, Double>>();
        System.out.println("Files corresponding to the initial cluster centers:");
        Set<Map.Entry<String, Map<String, Double>>> allTestSampleMapSet = allTestSampleMap.entrySet();
        for (Iterator<Map.Entry<String, Map<String, Double>>> it = allTestSampleMapSet.iterator(); it.hasNext();) {
            Map.Entry<String, Map<String, Double>> me = it.next();
            if (count == i * allTestSampleMapSet.size() / k) {
                meansMap.put(i, me.getValue());
                System.out.println(me.getKey());
                i++;
            }
            count++;
        }
        return meansMap;
    }

    /**
     * Writes the clustering result to a file.
     * @param kmeansClusterResult clustering result
     * @param kmeansClusterResultFile output file for the clustering result
     * @throws IOException
     */
    private void printClusterResult(Map<String, Integer> kmeansClusterResult, String kmeansClusterResultFile)
            throws IOException {
        FileWriter resultWriter = new FileWriter(kmeansClusterResultFile);
        Set<Map.Entry<String, Integer>> kmeansClusterResultSet = kmeansClusterResult.entrySet();
        for (Iterator<Map.Entry<String, Integer>> it = kmeansClusterResultSet.iterator(); it.hasNext();) {
            Map.Entry<String, Integer> me = it.next();
            resultWriter.append(me.getKey() + " " + me.getValue() + "\n");
        }
        resultWriter.flush();
        resultWriter.close();
    }

    /**
     * Evaluation: computes the entropy and the confusion matrix from the clustering result file.
     * @param kmeansClusterResultFile clustering result file
     * @param k number of clusters
     * @return entropy of the clustering result
     * @throws IOException
     */
    private double evaluateClusterResult(String kmeansClusterResultFile, int k) throws IOException {
        Map<String, String> rightCate = new TreeMap<String, String>();
        Map<String, String> resultCate = new TreeMap<String, String>();
        FileReader crReader = new FileReader(kmeansClusterResultFile);
        BufferedReader crBR = new BufferedReader(crReader);
        String[] s;
        String line;
        while ((line = crBR.readLine()) != null) {
            s = line.split(" ");
            resultCate.put(s[0], s[1]);
            rightCate.put(s[0], s[0].split("_")[0]); // the true category is encoded in the file name
        }
        crBR.close();
        return computeEntropyAndConfuMatrix(rightCate, resultCate, k); // return the entropy
    }

    /**
     * Computes and prints the confusion matrix, and returns the entropy.
     * @param rightCate map of the true categories
     * @param resultCate map of the clustering results
     * @param k number of clusters
     * @return entropy of the clustering
     */
    private double computeEntropyAndConfuMatrix(Map<String, String> rightCate, Map<String, String> resultCate, int k) {

        // k rows by 20 columns; [i][j] is the number of documents in cluster i that belong to category j
        int[][] confusionMatrix = new int[k][20];

        // first map every category name to an array index
        SortedSet<String> cateNames = new TreeSet<String>();
        Set<Map.Entry<String, String>> rightCateSet = rightCate.entrySet();
        for (Iterator<Map.Entry<String, String>> it = rightCateSet.iterator(); it.hasNext();) {
            Map.Entry<String, String> me = it.next();
            cateNames.add(me.getValue());
        }
        String[] cateNamesArray = cateNames.toArray(new String[0]);
        Map<String, Integer> cateNamesToIndex = new TreeMap<String, Integer>();
        for (int i = 0; i < cateNamesArray.length; i++) {
            cateNamesToIndex.put(cateNamesArray[i], i);
        }
        for (Iterator<Map.Entry<String, String>> it = rightCateSet.iterator(); it.hasNext();) {
            Map.Entry<String, String> me = it.next();
            confusionMatrix[Integer.parseInt(resultCate.get(me.getKey()))][cateNamesToIndex.get(me.getValue())]++;
        }

        // print the confusion matrix
        double[] clusterSum = new double[k]; // number of documents per cluster
        double[] everyClusterEntropy = new double[k]; // entropy per cluster
        double clusterEntropy = 0;
        System.out.print("      ");
        for (int i = 0; i < 20; i++) {
            System.out.printf("%-6d", i);
        }
        System.out.println();
        for (int i = 0; i < k; i++) {
            System.out.printf("%-6d", i);
            for (int j = 0; j < 20; j++) {
                clusterSum[i] += confusionMatrix[i][j];
                System.out.printf("%-6d", confusionMatrix[i][j]);
            }
            System.out.println();
        }
        System.out.println();

        // compute the entropy
        for (int i = 0; i < k; i++) {
            if (clusterSum[i] != 0) {
                for (int j = 0; j < 20; j++) {
                    double p = (double) confusionMatrix[i][j] / clusterSum[i];
                    if (p != 0)
                        everyClusterEntropy[i] += -p * Math.log(p);
                }
                clusterEntropy += clusterSum[i] / (double) rightCate.size() * everyClusterEntropy[i];
            }
        }
        return clusterEntropy;
    }

    public void KmeansClusterMain(String testSampleDir) throws IOException {
        // First compute the documents' TF-IDF vectors, stored as
        // Map<String, Map<String, Double>>, i.e. Map<file name, Map<feature word, TF-IDF value>>.
        ComputeWordsVector computV = new ComputeWordsVector();
        //int k[] = {10, 20, 30}; // three cluster counts
        int k[] = {20};
        Map<String, Map<String, Double>> allTestSampleMap = computV.computeTFMultiIDF(testSampleDir);
        for (int i = 0; i < k.length; i++) {
            System.out.println("Starting clustering into " + k[i] + " clusters");
            String KmeansClusterResultFile = "E:\\DataMiningSample\\KmeansClusterResult\\";
            Map<String, Integer> KmeansClusterResult = new TreeMap<String, Integer>();
            KmeansClusterResult = doProcess(allTestSampleMap, k[i]);
            KmeansClusterResultFile += k[i];
            printClusterResult(KmeansClusterResult, KmeansClusterResultFile);
            System.out.println("The Entropy for this Cluster is "
                    + evaluateClusterResult(KmeansClusterResultFile, k[i]));
        }
    }

    public static void main(String[] args) throws IOException {
        // Standalone test: evaluate an existing clustering result file.
        KmeansCluster test = new KmeansCluster();
        String KmeansClusterResultFile = "E:\\DataMiningSample\\KmeansClusterResult\\20";
        System.out.println("The Entropy for this Cluster is "
                + test.evaluateClusterResult(KmeansClusterResultFile, 20));
    }
}
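The entropy that evaluateClusterResult reports is the standard clustering entropy. With n_{ij} documents of true category j in cluster i, n_i = \sum_j n_{ij} documents in cluster i, and N documents overall, the code above computes

E = \sum_{i=1}^{k} \frac{n_i}{N} \left( -\sum_{j=1}^{20} p_{ij} \ln p_{ij} \right), \qquad p_{ij} = \frac{n_{ij}}{n_i}.

A perfectly pure clustering has entropy 0; the more the 20 newsgroup categories are mixed within the clusters, the larger the value.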
4. Program Entry Point
package com.datamine.kmeans;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class ClusterMain {

    /**
     * Entry point of the K-means clustering program.
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
        // Data preprocessing was already implemented for the classification algorithms and is omitted here.
        ComputeWordsVector computeV = new ComputeWordsVector();
        KmeansCluster kmeansCluster = new KmeansCluster();
        String srcDir = "E:\\DataMiningSample\\processedSample\\";
        String desDir = "E:\\DataMiningSample\\clusterTestSample\\";
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        String beginTime = sdf.format(new Date());
        System.out.println("Program start time: " + beginTime);
        String[] terms = computeV.createTestSamples(srcDir, desDir);
        kmeansCluster.KmeansClusterMain(desDir);
        String endTime = sdf.format(new Date());
        System.out.println("Program end time: " + endTime);
    }
}
5. Clustering Results
Program start time: 2016-03-14 17:02:38
special words map sizes:3832
the total number of test files is 18828
Starting clustering into 20 clusters
Files corresponding to the initial cluster centers:
alt.atheism_49960
comp.graphics_38307
comp.os.ms-windows.misc_10112
comp.sys.ibm.pc.hardware_58990
comp.sys.mac.hardware_50449
comp.windows.x_66402
comp.windows.x_68299
misc.forsale_76828
rec.autos_103685
rec.motorcycles_105046
rec.sport.baseball_104941
rec.sport.hockey_54126
sci.crypt_15819
sci.electronics_54016
sci.med_59222
sci.space_61185
soc.religion.christian_20966
talk.politics.guns_54517
talk.politics.mideast_76331
talk.politics.misc_178699
Iteration No.0-------------------------
okCount = 512
Iteration No.1-------------------------
okCount = 10372
Iteration No.2-------------------------
okCount = 15295
Iteration No.3-------------------------
okCount = 17033
Iteration No.4-------------------------
okCount = 17643
Iteration No.5-------------------------
okCount = 18052
Iteration No.6-------------------------
okCount = 18282
Iteration No.7-------------------------
okCount = 18404
Iteration No.8-------------------------
okCount = 18500
Iteration No.9-------------------------
okCount = 18627
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
0 482 0 3 3 1 1 0 5 2 1 0 0 2 27 11 53 4 6 15 176
1 4 601 69 8 14 127 7 5 5 8 0 14 31 16 34 2 2 2 1 5
2 1 64 661 96 18 257 26 9 3 0 0 13 25 13 6 2 3 2 6 2
3 0 56 78 575 213 15 119 15 6 2 1 4 131 2 4 2 6 0 2 1
4 1 25 13 151 563 11 50 3 3 1 2 14 125 4 8 1 0 3 0 0
5 2 28 78 25 37 348 13 2 0 0 2 5 38 5 6 2 1 1 2 8
6 20 80 24 21 23 166 38 45 45 26 10 37 87 34 27 22 15 8 35 12
7 4 20 6 24 45 6 629 28 20 14 0 3 87 10 4 1 8 0 13 0
8 0 2 1 10 8 4 25 781 40 1 1 0 70 5 10 2 8 4 2 3
9 4 2 11 0 1 1 11 34 831 1 0 1 7 7 0 1 1 1 8 0
10 10 7 6 2 4 1 7 7 4 633 4 5 11 18 9 5 13 8 10 3
11 1 0 1 9 4 1 20 1 3 286 961 0 17 8 4 2 2 0 5 3
12 3 14 0 6 1 2 2 0 1 1 0 858 51 1 1 2 16 8 69 4
13 3 15 4 7 7 17 5 12 8 5 2 5 46 13 793 6 5 2 30 5
14 2 4 0 1 0 2 4 6 3 4 4 2 14 746 3 1 2 3 55 11
15 30 43 29 39 15 18 12 13 7 3 4 13 195 38 36 5 6 18 5 11
16 195 1 0 2 0 1 1 0 4 1 4 1 4 16 6 846 3 6 16 274
17 8 2 0 2 4 2 1 5 7 0 0 10 30 12 5 28 363 9 289 23
18 19 1 0 0 2 0 0 6 0 1 1 3 1 3 2 9 8 843 48 18
19 10 8 1 1 1 0 2 13 2 6 3 3 9 12 18 5 444 16 164 69
The Entropy for this Cluster is 1.2444339205006887
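For scale (a back-of-the-envelope bound, not from the original run): the worst case for 20 categories is a uniform mix in every cluster, giving entropy \ln 20 \approx 3.00, so 1.244 is well below the random baseline.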
Program end time: 2016-03-14 17:08:24