基於密度的聚類演算法(DBSCAN)的java實現
阿新 • • 發佈:2019-01-04
k-means和EM演算法適合發現凸型的聚類(大概就是圓形,橢圓形比較規則的類),而對於非凸型的聚類,這兩種方法就很難找到準確的聚類了。比如如下圖:
可能來自不同類的點反而比來自相同類的點還要靠的更近。
太多的原理和演算法介紹,大家可以找到很多相關資料。(推薦《Data Mining and Analysis: FundamentalConcepts and Algorithms》)。下面的程式碼是對基於密度聚類演算法的一種實現。希望能夠幫助想要學習了理解這種演算法的同學。
import java.util.ArrayList; import java.util.List; /** * * @author aturbo * 基於密度的聚類演算法 */ public class MyDBSCAN { private static final double[][] points = { {3.0, 8.04}, {4.0, 7.95}, {4.4, 8.58}, {3.6, 8.81}, {5.0, 8.33}, {6.0, 6.96}, {17.0, 4.24}, {18.0, 4.26}, {16.0, 3.84}, {17.0, 4.82}, {15.0, 5.68}, {17.0, 5.68}, {11.0, 10.68}, {13.0, 9.68}, {11.8, 10.0}, {12.0, 11.18}, {8.0, 12.0}, {9.2, 9.68}, {8.8, 11.2}, {10.0,11.4}, {7.0, 9.68}, {6.1, 10.68}, {5.70, 1.68}, {5.0, 2.68}, {12.0, 0.68} }; private static int minpts = 6; private static double radius = 1.3; private static List<List<double[]>> clusters; private static List<double[]> cores; /** * 歐氏距離 * @param point1 * @param point2 * @return */ private static double countEurDistance(double[] point1,double[] point2){ double eurDistance = 0.0; for(int i=0;i<point1.length;i++){ eurDistance += (point1[i]-point2[i])*(point1[i]-point2[i]); } return Math.sqrt(eurDistance); } /** * find the core points * @param points * @param minpts * @param radius * @return */ private static List<double[]> findCores(double[][] points,int minpts,double radius){ List<double[]> cores = new ArrayList<double[]>(); for(int i = 0; i < points.length;i++){ int pts = 0; for(int j = 0; j < points.length;j++){ for(int k = 0; k < points[i].length;k++){ if(countEurDistance(points[i], points[j])<radius){ pts++; } } } if(pts>=minpts){ cores.add(points[i]); } } return cores; } /** * put the core point to cluster and get the densityconnect */ private static void putCoreToCluster(){ clusters = new ArrayList<List<double[]>>(); int clusterNum = 0; for(int i = 0;i<cores.size();i++){ clusters.add(new ArrayList<double[]>()); clusters.get(clusterNum).add(cores.get(i)); densityConnected(points, cores.get(i), clusterNum); clusterNum++; } } /** * * @param points * @param core * @param clusterNum */ private static void densityConnected(double[][] points,double[] core,int clusterNum){ boolean isputToCluster;//是否已經歸為某個類 boolean isneighbour = false;//是不是core的“鄰居” cores.remove(core);//對某個core點處理後就從core集中去掉 for(int i = 0; i < points.length;i++){ isneighbour = false; isputToCluster = false; for(List<double[]> cluster:clusters){ if(cluster.contains(points[i])){//如果已經歸為某個類 isputToCluster = true; break; } } if(isputToCluster)continue;//已在聚類中,跳過,不處理 if(countEurDistance(points[i], core)<radius){//是目前加入的core點的“鄰居”嗎?,ture的話,就和這個core加入一個類 clusters.get(clusterNum).add(points[i]); isneighbour = true; } if(isneighbour){//如果是鄰居,才會接下來對鄰居進行densityConnected處理,否則,結束這個core點的處理 if(cores.contains(points[i])){ cores.remove(points[i]); densityConnected(points, points[i], clusterNum); } } } } public static void main(String[] args){ cores = findCores(points, minpts, radius); System.out.println("點的個數:"+points.length); System.out.println(cores.size()+" core points:"); for(double[] core:cores){ System.out.print("["); for(int i = 0;i< core.length;i++){ System.out.print(core[i]); if(i!=(core.length-1)) System.out.print(","); } System.out.print("]"); System.out.println(); } putCoreToCluster(); int i = 0; for(List<double[]> cluster:clusters){ System.out.println("cluster "+ i++ +":"); for(double[] point:cluster){ System.out.println("["+point[0]+","+point[1]+"]"); } } int flag = 0; for(int j = 0;j<points.length;j++){ flag = 0; for(List<double[]> cluster:clusters){ if(cluster.contains(points[j])){ flag = 1; break; } } if(flag==0)System.out.println("noise point:"+"["+points[j][0]+","+points[j][1]+"]"); } } }
具體演算法流程:
參考文獻:
《Data Mining and Analysis: FundamentalConcepts and Algorithms》