如果数据集中缺失数据,将其替换成所属特征的平均值。
假如文件 forTest.txt 中有以下数据集(每行一个样本,特征之间以逗号分隔):
2,2,2,NaN,2
3,3,NaN,3,3
4,NaN,4,4,4
加载和替换函数:
import numpy as np


def replaceNanWithMean(filename: str) -> np.matrix:
    """Load a comma-separated numeric file and fill NaN entries with column means.

    Each column (feature) is processed independently: its NaN entries are
    replaced by the mean of the non-NaN entries of that same column.

    Args:
        filename: Path of a text file with one sample per line and
            comma-separated numeric fields; missing values are written
            as ``NaN``.

    Returns:
        The loaded data as a ``numpy.matrix`` with missing values filled in.
        A column that consists only of NaN is left unchanged, because it
        has no observed values to average.
    """
    dataMat = loadDataSet(filename, ',')
    # Work on a plain ndarray view: it shares memory with dataMat, so the
    # in-place writes below show up in the returned matrix, while indexing
    # follows ordinary ndarray rules instead of np.matrix's always-2-D rules.
    values = dataMat.view(np.ndarray)
    missing = np.isnan(values)
    for col in range(values.shape[1]):
        col_missing = missing[:, col]
        if col_missing.all():
            # No observed value in this column: the mean is undefined, so
            # keep the NaNs rather than averaging an empty slice.
            continue
        values[col_missing, col] = values[~col_missing, col].mean()
    return dataMat


def loadDataSet(filename: str, delim: str = ' ') -> np.matrix:
    """Read a delimited text file of numbers into a ``numpy.matrix``.

    Args:
        filename: Path of the text file to read.
        delim: Field separator used within each line.

    Returns:
        A float matrix with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    with open(filename) as f:
        stripped = (line.strip() for line in f)
        # Blank lines carry no sample; skipping them avoids float('') errors.
        rows = [
            [float(field) for field in line.split(delim)]
            for line in stripped
            if line
        ]
    # np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
    return np.asmatrix(rows)


if __name__ == "__main__":
    dataMat = replaceNanWithMean('forTest.txt')
    print(dataMat)
输出:
[[2.  2.  2.  3.5 2. ]
 [3.  3.  3.  3.  3. ]
 [4.  2.5 4.  4.  4. ]]