注释:由于各方面原因,理论部分不做介绍,网上很多自行百度吧!
pca.py
1 import numpy as np 2 import matplotlib.pyplot as plt 3 import math 4 5 def loadDataSet(filename, delin = ' '): 6 fr = open(filename) 7 #读取分割存入数组 8 stringArr = [line.strip().split(delin) for line in fr.readlines()] 9 dataArr = [list(map(float,line)) for line in stringArr] 10 return np.mat(dataArr) 11 def pca(dataMat, topNfeet = 9999999): 12 meanVals = np.mean(dataMat,axis=0)#求取平均值 13 meanRemoved = dataMat - meanVals 14 covMat = np.cov(meanRemoved,rowvar=0)#方差 15 eigVals, eigVects= np.linalg.eig(np.mat(covMat))#求解特征向量和特征值 16 eigValInd = np.argsort(eigVals)#对特征值进行排序 17 eigValInd = eigValInd[:-(topNfeet+1):-1]#最后的-1是防止越界的,当然你可以在前面加一个判断 18 redEigVects = eigVects[:,eigValInd] 19 lowDDataMat = meanRemoved*redEigVects # 20 reconMat = (lowDDataMat * redEigVects.T) + meanVals 21 return lowDDataMat, reconMat
main.py
1 import PCA 2 import matplotlib.pyplot as plt 3 4 if __name__ == "__main__": 5 6 dataMat = PCA.loadDataSet('testSet.txt') 7 lowDMat, reconMat = PCA.pca(dataMat,1) 8 fig = plt.figure() 9 ax = fig.add_subplot(111) 10 ax.scatter(dataMat[:,0].flatten().A[0],dataMat[:,1].flatten().A[0],marker = '^',s=90) 11 ax.scatter(reconMat[:,0].flatten().A[0],reconMat[:,1].flatten().A[0],marker = "o",s=50,c='red') 12 plt.show()
对丢失的值进行替代:
1 #零的数据都转化为平均值 2 def replaceNanWithMean(): 3 dataMat = loadDataSet('secom.data',' ') 4 numFeat = dataMat.shape[1] 5 for i in range(numFeat): 6 meanVal = np.mean(dataMat[np.nonzero(~np.isnan(dataMat[:,i].A))[0],i]) 7 dataMat[np.nonzero(np.isnan(dataMat[:,i].A))[0],i] = meanVal 8 return dataMat