# Machine Learning 之 LOF離群點檢驗
# 阿新 • 發佈:2019-01-08
#coding=utf-8
#本質是基於密度的檢測 缺點:計算量巨大
#優化 重複點計算
import math
# Scratch check of list slicing (appears to be a leftover experiment).
# Written as a single-argument print call with explicit formatting so it
# parses on Python 3 and still prints "[1] [3, 2]" on Python 2.
print("%s %s" % (sorted([1, 3, 2])[:1], [1, 3, 2][1:]))
class LOF(object):
    """Density-based outlier detector using the Local Outlier Factor (LOF).

    For every point p the detector computes:

    * k-distance(p): distance from p to its k-th nearest *other* point;
    * N_k(p): every other point whose distance to p is <= k-distance(p)
      (ties at the k-th distance are all kept, so N_k may hold more than
      k points);
    * reach-dist(p, o) = max(k-distance(o), d(p, o));
    * lrd(p) = 1 / mean(reach-dist(p, o) for o in N_k(p));
    * LOF(p) = mean(lrd(o) / lrd(p) for o in N_k(p)).

    Points whose LOF is strictly greater than ``threshold`` are reported.
    A value near 1 means p is about as dense as its neighbours; larger
    values mean p lies in a sparser region than they do.
    """

    def __init__(self, data, k, threshold):
        """Store the data set and the detection parameters.

        data      -- sequence of points, each an equal-length sequence of
                     numbers.
        k         -- neighbourhood size (positive integer).
        threshold -- LOF value above which a point counts as an outlier.
        """
        self.data = data
        self.k = k
        self.threshold = threshold
        self.outliners = []

    def __calDistance(self, a, b):
        """Return the Euclidean distance between points ``a`` and ``b``.

        A zero distance (duplicate points) is replaced by a tiny positive
        value so that the density computations never divide by zero.
        """
        dist = math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
        if dist == 0:
            dist = 0.00000001
        return dist

    def __calNk(self, num):
        """Return ``(Nk, distance)`` for the point at index ``num``.

        ``Nk`` is the list of indices of the k-distance neighbourhood and
        ``distance`` is the k-distance itself. The point is excluded from
        its own neighbourhood; other entries that merely have equal
        coordinates are still considered.
        """
        point = self.data[num]
        # Sorting (distance, index) pairs never has to compare the points
        # themselves, and breaks distance ties by position in the data.
        ranked = sorted(
            (self.__calDistance(point, other), j)
            for j, other in enumerate(self.data)
            if j != num
        )
        # Clamp k so that a data set with fewer than k other points still
        # yields a neighbourhood (everything else) instead of an IndexError.
        kth = min(self.k, len(ranked)) - 1
        distance = ranked[kth][0]
        Nk = [j for dist, j in ranked if dist <= distance]
        return Nk, distance

    def __getReachDis(self, num, Nk, neighbourhoods):
        """Return the reachability distances from point ``num`` to each
        neighbour index in ``Nk``.

        The k-distance used is the *neighbour's*, read from the
        precomputed ``neighbourhoods`` list of ``(Nk, distance)`` pairs.
        """
        point = self.data[num]
        return [
            max(neighbourhoods[o][1], self.__calDistance(point, self.data[o]))
            for o in Nk
        ]

    def __getLrd(self, num, neighbourhoods):
        """Return the local reachability density of the point at ``num``:
        the inverse of its mean reachability distance to its neighbours.
        """
        Nk = neighbourhoods[num][0]
        reachDis = self.__getReachDis(num, Nk, neighbourhoods)
        # Every distance is floored above zero, so the sum is positive.
        return float(len(reachDis)) / sum(reachDis)

    def __getLOF(self, num, neighbourhoods, lrdList):
        """Return ``(point, lofValue)`` for the point at index ``num``.

        ``lofValue`` is the mean ratio between each neighbour's density
        and the point's own density.
        """
        Nk = neighbourhoods[num][0]
        lrdPoint = lrdList[num]
        lofValue = sum(lrdList[o] / lrdPoint for o in Nk) / len(Nk)
        return self.data[num], lofValue

    def run(self):
        """Compute the LOF of every point and return the outliers.

        Returns the list of points (in data order) whose LOF exceeds
        ``self.threshold``; the same list is stored in ``self.outliners``.
        The result is rebuilt on every call, so repeated runs do not
        accumulate duplicates. With fewer than two points there are no
        neighbours to compare against and the result is empty.

        Raises ValueError if ``self.k`` is smaller than 1.
        """
        if self.k < 1:
            raise ValueError("k must be a positive integer")
        self.outliners = []
        indices = range(len(self.data))
        if len(self.data) < 2:
            return self.outliners
        # Neighbourhoods and densities are computed once per point and
        # shared, instead of being recomputed for every neighbour lookup.
        neighbourhoods = [self.__calNk(i) for i in indices]
        lrdList = [self.__getLrd(i, neighbourhoods) for i in indices]
        for i in indices:
            lofP, lofV = self.__getLOF(i, neighbourhoods, lrdList)
            if lofV > self.threshold:
                self.outliners.append(lofP)
        return self.outliners