- 显示PCA中主成分方差占比情况:
# The wildcard import is kept only because code elsewhere in this file may
# rely on the bare NumPy names; the functions below use the explicit ``np.``
# namespace so they do not depend on it.
from numpy import *  # noqa: F401,F403
import numpy as np

try:
    import matplotlib.pyplot as plt
    from matplotlib.pyplot import MultipleLocator
except ImportError:  # plotting is optional: pca() stays usable without matplotlib
    plt = None
    MultipleLocator = None


def pca(dataMat, topNfeat=9999999):
    """Project ``dataMat`` onto its ``topNfeat`` leading principal components.

    Args:
        dataMat: 2-D data (``np.matrix`` or array-like) with one sample per
            row and one feature per column.
        topNfeat: Number of principal components to keep. The default is
            large enough to keep every component.

    Returns:
        A ``(lowDDataMat, reconMat)`` tuple of ``np.matrix`` objects.
        ``lowDDataMat`` is the mean-centred data expressed in the retained
        components (one column per component, largest variance first).
        ``reconMat`` is that projection mapped back into the original
        feature space with the per-feature means added back.
    """
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    data = np.asmatrix(dataMat)

    meanVals = np.mean(data, axis=0)      # per-feature mean
    meanRemoved = data - meanVals         # centre every feature on zero
    covMat = np.asmatrix(np.cov(meanRemoved, rowvar=False))

    # The covariance matrix is symmetric, so eigh is the right solver: it
    # returns real eigenvalues and orthonormal eigenvectors, whereas the
    # general-purpose eig can produce complex values from round-off alone.
    eigVals, eigVects = np.linalg.eigh(covMat)
    eigVects = np.asmatrix(eigVects)

    # To inspect how much variance each component carries, call:
    #     plotPolyline(eigVals)

    # argsort is ascending; the reversed slice keeps the indices of the
    # topNfeat largest eigenvalues, ordered from largest to smallest.
    eigValInd = np.argsort(eigVals)[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]

    lowDDataMat = meanRemoved * redEigVects   # transform into the new basis
    # Map back to the original coordinates; adding meanVals undoes the
    # centring so the reconstruction is comparable with the input.
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat


def plotPolyline(eigVals, xMax=20):
    """Plot the share of total variance carried by each principal component.

    Args:
        eigVals: Eigenvalues obtained from PCA (they can be read as the
            variance along each component), in any order.
        xMax: Upper limit of the x axis, i.e. how many leading components
            are visible in the figure.

    Raises:
        ImportError: If matplotlib is not installed.
    """
    if plt is None:
        raise ImportError("plotPolyline requires matplotlib")

    # Accept lists as well as arrays; covariance eigenvalues are real, so any
    # imaginary part left by a general eigen-solver is dropped.
    values = np.real(np.asarray(eigVals)).ravel()
    eigValsSorted = np.sort(values)[::-1]            # largest first
    covarPercent = eigValsSorted / eigValsSorted.sum()

    # Select a CJK-capable font before the title and labels are created.
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

    plt.figure()
    X = np.arange(1, len(covarPercent) + 1)          # component rank, 1-based
    plt.plot(X, covarPercent, 'r^-')                 # line chart
    plt.title("主成分方差占比情况")

    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(5))   # one major tick every 5
    plt.xlim(0, xMax)
    plt.xlabel('主成分目录')
    plt.ylabel('方差的百分比')
    plt.show()
运行结果,如图所示:
结论:从折线图可以看出,只需保留前6个主成分,即可将数据集从590个特征缩减为6个特征,大约实现100:1的压缩。
优点:降维技术使得数据更易使用,并且能够从数据中识别出主要特征。常用的降维技术包括独立成分分析、因子分析和主成分分析,其中主成分分析最为流行。
缺点:由于需要将所有数据都调入内存,如果无法做到,就需要使用其它方法寻找其特征值。
参考:
3.Matplotlib坐标轴显示问题以及设置X,Y显示范围
4.《机器学习实战》第250页