1. 程式人生 > >機器學習實戰PCA演算法

機器學習實戰PCA演算法

1、pca演算法

def pca(dataMat, topNfeat=9999999):
    """Project a data set onto its leading principal components.

    Args:
        dataMat: Samples-by-features data (one row per sample). It is
            converted with ``asmatrix`` so that ``*`` below is always a
            matrix product, whether a matrix or a plain array is passed.
        topNfeat: Number of principal components to keep. The default is
            larger than any realistic feature count, so every component
            is kept.

    Returns:
        A ``(lowDDataMat, reconMat)`` tuple: the mean-centred data
        projected onto the kept eigenvectors, and that projection mapped
        back into the original feature space with the mean added back.
    """
    # Imported locally so the function does not rely on a module-level
    # ``from numpy import *`` being present.
    import numpy as np

    dataMat = np.asmatrix(dataMat)
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # centre every feature on zero
    covMat = np.cov(meanRemoved, rowvar=0)  # rowvar=0: columns are the variables
    # asmatrix instead of mat: the ``mat`` alias was removed in NumPy 2.0.
    eigVals, eigVects = np.linalg.eig(np.asmatrix(covMat))
    eigValInd = np.argsort(eigVals)  # indices in ascending eigenvalue order
    # Walk backwards from the end: keeps the topNfeat largest, largest first.
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]  # one kept eigenvector per column
    lowDDataMat = meanRemoved * redEigVects  # coordinates in the reduced space
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat

in[1]:from imp import reload
in[2]:import pca
in[3]:datamat = pca.loadDataSet(r'C:\Users\16793\Desktop\machinelearninginaction\Ch05\testSet.txt')
in[4]:datamat
Out[5]: 
matrix([[-1.7612000e-02,  1.4053064e+01,  0.0000000e+00],
        [-1.3956340e+00,  4.6625410e+00,  1.0000000e+00],
        [-7.5215700e-01,  6.5386200e+00,  0.0000000e+00],
        [-1.3223710e+00,  7.1528530e+00,  0.0000000e+00],
        [ 4.2336300e-01,  1.1054677e+01,  0.0000000e+00],
        [ 4.0670400e-01,  7.0673350e+00,  1.0000000e+00],
        [ 6.6739400e-01,  1.2741452e+01,  0.0000000e+00],
        [-2.4601500e+00,  6.8668050e+00,  1.0000000e+00],
        [ 5.6941100e-01,  9.5487550e+00,  0.0000000e+00],
        [-2.6632000e-02,  1.0427743e+01,  0.0000000e+00],
        [ 8.5043300e-01,  6.9203340e+00,  1.0000000e+00],
        [ 1.3471830e+00,  1.3175500e+01,  0.0000000e+00],
        [ 1.1768130e+00,  3.1670200e+00,  1.0000000e+00],
        [-1.7818710e+00,  9.0979530e+00,  0.0000000e+00],
        [-5.6660600e-01,  5.7490030e+00,  1.0000000e+00],
        [ 9.3163500e-01,  1.5895050e+00,  1.0000000e+00],
        [-2.4205000e-02,  6.1518230e+00,  1.0000000e+00],
        [-3.6453000e-02,  2.6909880e+00,  1.0000000e+00],
        [-1.9694900e-01,  4.4416500e-01,  1.0000000e+00],
        [ 1.0144590e+00,  5.7543990e+00,  1.0000000e+00],
        [ 1.9852980e+00,  3.2306190e+00,  1.0000000e+00],
        [-1.6934530e+00, -5.5754000e-01,  1.0000000e+00],
        [-5.7652500e-01,  1.1778922e+01,  0.0000000e+00],
        [-3.4681100e-01, -1.6787300e+00,  1.0000000e+00],
        [-2.1244840e+00,  2.6724710e+00,  1.0000000e+00],
        [ 1.2179160e+00,  9.5970150e+00,  0.0000000e+00],
        [-7.3392800e-01,  9.0986870e+00,  0.0000000e+00],
        [-3.6420010e+00, -1.6180870e+00,  1.0000000e+00],
        [ 3.1598500e-01,  3.5239530e+00,  1.0000000e+00],
        [ 1.4166140e+00,  9.6192320e+00,  0.0000000e+00],
        [-3.8632300e-01,  3.9892860e+00,  1.0000000e+00],
        [ 5.5692100e-01,  8.2949840e+00,  1.0000000e+00],
        [ 1.2248630e+00,  1.1587360e+01,  0.0000000e+00],
        [-1.3478030e+00, -2.4060510e+00,  1.0000000e+00],
        [ 1.1966040e+00,  4.9518510e+00,  1.0000000e+00],
        [ 2.7522100e-01,  9.5436470e+00,  0.0000000e+00],
        [ 4.7057500e-01,  9.3324880e+00,  0.0000000e+00],
        [-1.8895670e+00,  9.5426620e+00,  0.0000000e+00],
        [-1.5278930e+00,  1.2150579e+01,  0.0000000e+00],
        [-1.1852470e+00,  1.1309318e+01,  0.0000000e+00],
        [-4.4567800e-01,  3.2973030e+00,  1.0000000e+00],
        [ 1.0422220e+00,  6.1051550e+00,  1.0000000e+00],
        [-6.1878700e-01,  1.0320986e+01,  0.0000000e+00],
        [ 1.1520830e+00,  5.4846700e-01,  1.0000000e+00],
        [ 8.2853400e-01,  2.6760450e+00,  1.0000000e+00],
        [-1.2377280e+00,  1.0549033e+01,  0.0000000e+00],
        [-6.8356500e-01, -2.1661250e+00,  1.0000000e+00],
        [ 2.2945600e-01,  5.9219380e+00,  1.0000000e+00],
        [-9.5988500e-01,  1.1555336e+01,  0.0000000e+00],
        [ 4.9291100e-01,  1.0993324e+01,  0.0000000e+00],
        [ 1.8499200e-01,  8.7214880e+00,  0.0000000e+00],
        [-3.5571500e-01,  1.0325976e+01,  0.0000000e+00],
        [-3.9782200e-01,  8.0583970e+00,  0.0000000e+00],
        [ 8.2483900e-01,  1.3730343e+01,  0.0000000e+00],
        [ 1.5072780e+00,  5.0278660e+00,  1.0000000e+00],
        [ 9.9671000e-02,  6.8358390e+00,  1.0000000e+00],
        [-3.4400800e-01,  1.0717485e+01,  0.0000000e+00],
        [ 1.7859280e+00,  7.7186450e+00,  1.0000000e+00],
        [-9.1880100e-01,  1.1560217e+01,  0.0000000e+00],
        [-3.6400900e-01,  4.7473000e+00,  1.0000000e+00],
        [-8.4172200e-01,  4.1190830e+00,  1.0000000e+00],
        [ 4.9042600e-01,  1.9605390e+00,  1.0000000e+00],
        [-7.1940000e-03,  9.0757920e+00,  0.0000000e+00],
        [ 3.5610700e-01,  1.2447863e+01,  0.0000000e+00],
        [ 3.4257800e-01,  1.2281162e+01,  0.0000000e+00],
        [-8.1082300e-01, -1.4660180e+00,  1.0000000e+00],
        [ 2.5307770e+00,  6.4768010e+00,  1.0000000e+00],
        [ 1.2966830e+00,  1.1607559e+01,  0.0000000e+00],
        [ 4.7548700e-01,  1.2040035e+01,  0.0000000e+00],
        [-7.8327700e-01,  1.1009725e+01,  0.0000000e+00],
        [ 7.4798000e-02,  1.1023650e+01,  0.0000000e+00],
        [-1.3374720e+00,  4.6833900e-01,  1.0000000e+00],
        [-1.0278100e-01,  1.3763651e+01,  0.0000000e+00],
        [-1.4732400e-01,  2.8748460e+00,  1.0000000e+00],
        [ 5.1838900e-01,  9.8870350e+00,  0.0000000e+00],
        [ 1.0153990e+00,  7.5718820e+00,  0.0000000e+00],
        [-1.6580860e+00, -2.7255000e-02,  1.0000000e+00],
        [ 1.3199440e+00,  2.1712280e+00,  1.0000000e+00],
        [ 2.0562160e+00,  5.0199810e+00,  1.0000000e+00],
        [-8.5163300e-01,  4.3756910e+00,  1.0000000e+00],
        [-1.5100470e+00,  6.0619920e+00,  0.0000000e+00],
        [-1.0766370e+00, -3.1818880e+00,  1.0000000e+00],
        [ 1.8210960e+00,  1.0283990e+01,  0.0000000e+00],
        [ 3.0101500e+00,  8.4017660e+00,  1.0000000e+00],
        [-1.0994580e+00,  1.6882740e+00,  1.0000000e+00],
        [-8.3487200e-01, -1.7338690e+00,  1.0000000e+00],
        [-8.4663700e-01,  3.8490750e+00,  1.0000000e+00],
        [ 1.4001020e+00,  1.2628781e+01,  0.0000000e+00],
        [ 1.7528420e+00,  5.4681660e+00,  1.0000000e+00],
        [ 7.8557000e-02,  5.9736000e-02,  1.0000000e+00],
        [ 8.9392000e-02, -7.1530000e-01,  1.0000000e+00],
        [ 1.8256620e+00,  1.2693808e+01,  0.0000000e+00],
        [ 1.9744500e-01,  9.7446380e+00,  0.0000000e+00],
        [ 1.2611700e-01,  9.2231100e-01,  1.0000000e+00],
        [-6.7979700e-01,  1.2205300e+00,  1.0000000e+00],
        [ 6.7798300e-01,  2.5566660e+00,  1.0000000e+00],
        [ 7.6134900e-01,  1.0693862e+01,  0.0000000e+00],
        [-2.1687910e+00,  1.4363200e-01,  1.0000000e+00],
        [ 1.3886100e+00,  9.3419970e+00,  0.0000000e+00],
        [ 3.1702900e-01,  1.4739025e+01,  0.0000000e+00]])
in[5]:datamat = pca.loadDataSet(r'C:\Users\16793\Desktop\machinelearninginaction\Ch13\testSet.txt')
in[6]:datamat
Out[7]: 
matrix([[10.235186, 11.321997],
        [10.122339, 11.810993],
        [ 9.190236,  8.904943],
        ...,
        [ 9.854922,  9.201393],
        [ 9.11458 ,  9.134215],
        [10.334899,  8.543604]])
in[7]:from numpy import *
in[8]:topnfeat = 1
in[9]:meanvals = mean(datamat, axis = 0 )
in[10]:meanvals
Out[11]: matrix([[9.06393644, 9.09600218]])
in[11]:meanremoved = datamat - meanvals
in[12]:meanremoved
Out[13]: 
matrix([[ 1.17124956,  2.22599482],
        [ 1.05840256,  2.71499082],
        [ 0.12629956, -0.19105918],
        ...,
        [ 0.79098556,  0.10539082],
        [ 0.05064356,  0.03821282],
        [ 1.27096256, -0.55239818]])
in[14]:covmat = cov(meanremoved, rowvar = 0)
in[15]:covmat
Out[16]: 
array([[1.05198368, 1.1246314 ],
       [1.1246314 , 2.21166499]])

in[16]:eigvals, eigvects = linalg.eig(mat(covmat))
in[17]:eigvals
Out[18]: array([0.36651371, 2.89713496])
in[19]:eigvects
Out[20]: 
matrix([[-0.85389096, -0.52045195],
        [ 0.52045195, -0.85389096]])
in[20]:eigvalind = argsort(eigvals)
in[21]:eigvalind
Out[22]: array([0, 1], dtype=int64)
in[22]:eigvalind = eigvalind[:-(topnfeat+1):-1]
in[23]:eigvalind
Out[24]: array([1], dtype=int64)
in[24]:redeigvects = eigvects[:,eigvalind]
in[25]:redeigvects
Out[26]: 
matrix([[-0.52045195],
        [-0.85389096]])
in[26]:rowddatamat =  meanremoved *  redeigvects

in[27]:rowddatamat
Out[38]: 
matrix([[-2.51033597e+00],
        [-2.86915379e+00],
        [ 9.74108510e-02],
        [-7.67782222e-01],
        [ 1.02715333e+00],
        [-1.44409178e+00],
        [-2.17360352e+00],
        [-7.73998803e-01],
        [-1.09983463e+00],
        [-1.70275987e+00],
        [-5.39605615e-01],
        [-9.15572638e-01],
        [-2.42669452e+00],

        .......

        [-5.01662249e-01],
        [-5.89871235e-02],
        [-1.89787138e-01]])

in[27]:reconmat = (rowddatamat * redeigvects.T) + meanvals
in[28]:reconmat
Out[29]: 
matrix([[10.37044569, 11.23955536],
        [10.55719313, 11.54594665],
        [ 9.01323877,  9.01282393],
        ...,
        [ 9.32502753,  9.52436704],
        [ 9.0946364 ,  9.14637075],
        [ 9.16271152,  9.2580597 ]])
in[29]:shape(reconmat)
Out[30]: (1000, 2)

 因為不瞭解矩陣在運算過程中的變化,所以逐行輸出中間結果,來體現矩陣的變化。第一次寫部落格哈哈,寫得好爛!