《機器學習實戰》PCA原始碼
阿新 • • 發佈:2018-11-21
#coding:utf-8
from numpy import *

# Pseudocode for projecting data onto its first N principal components:
#     remove the mean
#     compute the covariance matrix
#     compute the eigenvalues and eigenvectors of the covariance matrix
#     sort the eigenvalues from largest to smallest
#     keep the N eigenvectors with the largest eigenvalues
#     transform the data into the space spanned by those N eigenvectors


def loadDataSet(filename, delim=' '):
    """Read a delimited text file of numbers into a NumPy matrix.

    Args:
        filename: Path of the text file; one sample per line.
        delim: Separator between the values on a line.

    Returns:
        A ``numpy.matrix`` of floats with one row per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager guarantees the handle is closed even when parsing
    # fails; blank lines (e.g. a trailing newline) are skipped instead of
    # crashing on float('').
    with open(filename) as fr:
        stringArr = [line.strip().split(delim) for line in fr if line.strip()]
    # map() is lazy in Python 3, so wrap it in list() to get real rows.
    dataArr = [list(map(float, fields)) for fields in stringArr]
    # asmatrix is what the old `mat` alias pointed to; `mat` itself is gone
    # from NumPy 2.x.
    return asmatrix(dataArr)


def pca(dataMat, topNfeat=99999):
    """Project ``dataMat`` onto its leading principal components.

    Args:
        dataMat: Matrix with one sample per row and one feature per column.
        topNfeat: Number of principal components to keep, i.e. the target
            dimensionality. The large default keeps every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the mean-centred data expressed
        in the retained components, and that data mapped back into the
        original feature space with the mean added again.
    """
    # axis=0 averages down the rows, giving one mean per column (1 x n).
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    # rowvar=0: each column is a variable and each row an observation.
    covMat = cov(meanRemoved, rowvar=0)
    eigVals, eigVects = linalg.eig(asmatrix(covMat))
    # argsort is ascending; the reversed slice picks the indices of the
    # topNfeat largest eigenvalues, largest first.
    eigValInd = argsort(eigVals)
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]
    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat


if __name__ == "__main__":
    # Raw string: same path value, without the invalid "\P" / "\d" escapes.
    Add = r"D:\PycharmProjects\PCA\dataTest.txt"
    dataMat = loadDataSet(Add)
    lowDMat, reconMat = pca(dataMat, 1)  # reduce to 1 dimension
    print(shape(lowDMat))
    print(shape(reconMat))

    import matplotlib.pyplot as plt

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Original samples as blue stars, reconstructed samples as red dots.
    ax.scatter(dataMat[:, 0].flatten().A[0], dataMat[:, 1].flatten().A[0],
               marker="*", s=90, c="b")
    ax.scatter(reconMat[:, 0].flatten().A[0], reconMat[:, 1].flatten().A[0],
               marker="o", s=40, c="r")
    plt.show()
用平均值替換缺失值(NaN)的函式:
# Replace NaN entries with the mean of their column.
def replaceNaNWithMean(filename="secom.data", delim=" "):
    """Load a data set and fill every NaN with its column's mean.

    The file name and delimiter are forwarded to ``loadDataSet``, which has
    no default for ``filename`` and so cannot be called without arguments.

    Args:
        filename: Path of the data file to load. NOTE(review): the default
            is a placeholder assumed from the book's example data set --
            confirm or pass the real path.
        delim: Separator between the values on a line.

    Returns:
        The loaded matrix with each NaN replaced by the mean of the
        non-NaN values in the same column.
    """
    dataMat = loadDataSet(filename, delim)
    numFeat = shape(dataMat)[1]
    for i in range(numFeat):
        # Boolean mask of the missing entries in column i.
        missing = isnan(dataMat[:, i].A)
        # Mean over the rows of this column that hold a real value.
        meanVal = mean(dataMat[nonzero(~missing)[0], i])
        # Overwrite only the rows where the value was missing.
        dataMat[nonzero(missing)[0], i] = meanVal
    return dataMat