寫給自己看的DBSCAN(1):基本實現
阿新 • • 發佈:2020-07-19
搬運自我的CSDN https://blog.csdn.net/u013213111/article/details/107308563
參考:西瓜書
DBSCAN的思想是基於密度來聚類,十分直觀易懂,更嚴謹的描述可見西瓜書,其中個人認為最關鍵的是:
若\(x\)為核心物件,由\(x\)密度可達的所有樣本組成的集合記為\(X=\{x' \in D \mid x' \text{ 由 } x \text{ 密度可達}\}\),則不難證明\(X\)即為滿足連線性與最大性的簇。
這就指明瞭實現的一種思路:先找到所有的核心物件,再找到這些核心物件密度可達的其他點。
虛擬碼如下:
這裡給出C++的實現,基本上忠於上述的虛擬碼,沒有對效能進行優化:
/// A 2-D integer sample together with the labels written by dbscan_cpp().
struct clusterData {
    int coordinates[2];   // coordinates[0]: x, coordinates[1]: y
    int clusterIndex = 0; // 1-based cluster id set by dbscan_cpp(); stays 0 if never assigned
    int dataType = 0;     // 0: noise, 1: boundary, 2: core
};

/*********
dbscan_cpp
Summary:
    Density-Based Spatial Clustering of Applications with Noise,
    implemented in C++.
Parameters:
    cluster:  array holding all points; each point's clusterIndex and
              dataType are expected to be 0 on entry (points that end up
              as noise are left untouched, not reset).
    totalPts: number of points in cluster.
    eps:      neighbourhood radius; point i is a neighbour of point j when
              their Euclidean distance is <= eps (a point is its own
              neighbour whenever eps >= 0).
    minPts:   minimum neighbourhood size (the point itself included) for a
              point to be a core point.
Return:
    Number of clusters found. Clusters are numbered from 1 in the order in
    which their lowest-indexed core point appears in the array. Returns 0
    when cluster is null or totalPts <= 0.
*********/
int dbscan_cpp(clusterData *cluster, const int totalPts, const double eps, const unsigned int minPts)
{
    // Nothing to cluster; also keeps a negative count away from the vector
    // size constructors below.
    if (cluster == nullptr || totalPts <= 0)
        return 0;

    const std::size_t pointCount = static_cast<std::size_t>(totalPts);

    // Step 1: eps-neighbourhood of every point, stored in ascending index
    // order. Coordinates are widened to double before subtracting so that
    // far-apart integer coordinates cannot overflow.
    std::vector<std::vector<int>> neighbors(pointCount);
    for (int j = 0; j < totalPts; ++j) {
        for (int i = 0; i < totalPts; ++i) {
            const double dx = static_cast<double>(cluster[j].coordinates[0]) - cluster[i].coordinates[0];
            const double dy = static_cast<double>(cluster[j].coordinates[1]) - cluster[i].coordinates[1];
            if (std::sqrt(dx * dx + dy * dy) <= eps)
                neighbors[j].push_back(i);
        }
    }

    // Step 2: grow one cluster from each core point that has not been
    // reached yet, breadth-first over density-reachable points.
    std::vector<char> visited(pointCount, 0);
    std::vector<int> frontier; // FIFO queue; consumed through the index `head`
    frontier.reserve(pointCount);
    int clusterCount = 0;

    for (int seed = 0; seed < totalPts; ++seed) {
        // Only an unvisited core point may start a new cluster.
        if (visited[seed] || neighbors[seed].size() < minPts)
            continue;

        ++clusterCount; // the index of the first cluster is 1, not 0
        frontier.clear();
        frontier.push_back(seed);
        visited[seed] = 1;

        for (std::size_t head = 0; head < frontier.size(); ++head) {
            const int q = frontier[head];
            cluster[q].clusterIndex = clusterCount;

            if (neighbors[q].size() < minPts) {
                // Reached from a core point but not dense itself: a boundary
                // point belongs to the cluster but does not extend it.
                cluster[q].dataType = 1;
                continue;
            }

            cluster[q].dataType = 2;
            for (const int nb : neighbors[q]) {
                if (!visited[nb]) {
                    visited[nb] = 1;
                    frontier.push_back(nb);
                }
            }
        }
    }

    return clusterCount;
}
這裡還有一份MATLAB的實現可供參考。