摘要: 仅用于记录R语言学习过程:
内容提要:
缺失值的识别与处理;
异常值与重复值的处理
正文:
缺失值的识别与处理
导读:
> x <- c(1,2,3,NA,NA,4)
> mean(x)
[1] NA
> sum(x)
[1] NA
> mean(x,na.rm = TRUE)
[1] 2.5
> sum(x,na.rm = TRUE)
[1] 10
> is.na(x)
[1] FALSE FALSE FALSE TRUE TRUE FALSE
> sum(is.na(x))
[1] 2
n 用[! ]去掉缺失值
u 示例:
> x[!is.na(x)]
[1] 1 2 3 4
> iris_na <- iris
> for (i in 1:4) {
+ iris_na[sample(1:nrow(iris),5),i] = NA
+ }
>
> sapply(iris_na[,1:4],function(x)sum(is.na(x)))
我 是 中 国
5 5 5 5
> sapply(iris_na[,1:4],function(x)which(is.na(x))) #which返回的是位置
我 是 中 国
[1,] 10 11 6 28
[2,] 60 52 21 108
[3,] 92 54 32 111
[4,] 108 113 79 124
[5,] 144 136 83 135
u psych扩展包里describe()函数,用于描述数据集的基本统计值
u 计算缺失值的比例:
l 示例:
> sapply(iris_na[,1:4],function(x)sum(is.na(x)/nrow(iris_na)))
我 是 中 国
0.03333333 0.03333333 0.03333333 0.03333333
n 缺失值对于回归分析的影响
u 示例:
u lm(Sepal.length~Sepal.Width,data = iris_na,na.action = na.omit) #在进行回归分析是也要去除缺失值,用na.action,但其默认也是na.action=na.omit
n 缺失值得填补:一般采用中数和中位数,具体如下
u 示例1:
mean_value <- sapply(iris_na[,1:4],mean,na.rm = TRUE)
for (i in 1:4) {
iris_na[is.na(iris_na[,i],i)] = mean_value[i]
}
u 示例2:
> cancer[sample(1:1000,90),2] <- NA #生成缺失值
> mean_value <- tapply(cancer$sur_days,list(cancer$type,cancer$treatment),mean,na.rm = TRUE)
> mean_value #求均值
chemo sugr
colon 523.6489 539.9329
liver 530.1656 576.6056
lung 547.7427 553.1241
> for (i in 1:3){
+ for (j in 1:2){
+ cancer$sur_days[is.na(cancer$sur_days) & cancer$type == rownames(mean_value)[i]
+ & cancer$treatment == colnames(mean_value)[j]] =mean_value[i,j]
+ }
+ } #填补缺失值
> summary(cancer)
id sur_days type treatment
- Min. : 1.0 Min. : 100.0 colon:320 chemo:497
1st Qu.: 250.8 1st Qu.: 335.8 liver:330 sugr :503
Median : 500.5 Median : 547.7 lung :350
Mean : 500.5 Mean : 545.4
3rd Qu.: 750.2 3rd Qu.: 738.5
- Max. :1000.0 Max. :1000.0
> describe(cancer)
vars n mean sd median trimmed mad min max range
id 1 1000 500.50 288.82 500.50 500.50 370.65 1 1000 999
sur_days 2 1000 545.45 247.96 547.74 544.41 300.23 100 1000 900
type* 3 1000 2.03 0.82 2.00 2.04 1.48 1 3 2
treatment* 4 1000 1.50 0.50 2.00 1.50 0.00 1 2 1
skew kurtosis se
id 0.00 -1.20 9.13
sur_days 0.03 -1.00 7.84
type* -0.06 -1.51 0.03
treatment* -0.01 -2.00 0.02
n 用R包对缺失值进行填补
u 安装mlbench包
l 示例:
install.packages('mlbench')
library(mlbench)
data('BostonHousing')
original_data <- BostonHousing
set.seed(2018)
BostonHousing[sample(1:nrow(BostonHousing),80),'rad'] <- NA #在rad列生成缺失值
BostonHousing[sample(1:nrow(BostonHousing),80),'ptratio'] <- NA #在ptratio列生成缺失值
u 安装mice包,利用里面的md.pattern()函数查看缺失值得模式。md.pattern(BostonHousing数据集名称) ,0代表缺失,1表示不缺失,最后一行表示缺失总数。
u 安装Hmisc包,利用其中的impute()插补:参数设置:需要插补的对象,拟插补的参数
n 示例:
im_mean <- impute(BostonHousing$ptratio,mean) #mean可以换成median或者固定的数字
BostonHousing$ptratio <- NULL
BostonHousing$im_mean <- im_mean
u 用mice包(链式方程多元插值)进行缺失值填补,基本思想为利用一种模型,把缺失值的变量作为因变量,把其他不缺失的变量作为自变量进行回归分析,以预测缺失值
n 示例:
mice_mod <- mice(BostonHousing[,!names(BostonHousing)%in% 'medv'],method ='rf') #rf 随机森林
mice_output <- complete(mice_mod) #complete函数,完整显示
anyNA(mice_output) #是否还有NA值
FALSE
n 示例2:查看预测的精度
actuals <- original_data$rad[is.na(BostonHousing$rad)] #查看原始值
predicts <- mice_out[is.na(BostonHousing$rad),'rad'] #查看预测值
mean(actuals != predicts) #原始值不等于预测值的比例
0.3 #预测精度为70%
u VIM包:可以对缺失值进行可视化;也可以对数据进行插补
n 数据可视化:
u 示例:
md.pattern(airquality)
aggr_plot <- aggr(airquality,col = c('red','green'),numbers = TRUE,sortVars = T,labels = names(airquality),cex.axis = 0.7,gap = 3)
marginplot(airquality[1:2]
n 用线性回归模型进行插补:
n 示例:
data(sleep)
head(sleep)
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
1 6654.000 5712.0 NA NA 3.3 38.6 645 3 5 3
2 1.000 6.6 6.3 2.0 8.3 4.5 42 3 1 3
3 3.385 44.5 NA NA 12.5 14.0 60 1 1 1
4 0.920 5.7 NA NA 16.5 NA 25 5 2 3
5 2547.000 4603.0 2.1 1.8 3.9 69.0 624 3 5 4
6 10.550 179.5 9.1 0.7 9.8 27.0 180 4 4 4
sleepIm <- regressionImp(Sleep + Gest + Span + Dream + NonD ~ BodyWgt+BrainWgt,data = sleep) # 第一个为缺失值变量,~后面为不包含缺失值的变量,data为数据集名,也可以再考虑family 参数,可选auto
head(sleepIm) #不含有缺失值了,TRUE代表该位置原来为缺失值。
BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger
1 6654.000 5712.0 -11.732867 -0.6897314 3.3 38.60000 645 3 5 3
2 1.000 6.6 6.300000 2.0000000 8.3 4.50000 42 3 1 3
3 3.385 44.5 8.987353 2.0132372 12.5 14.00000 60 1 1 1
4 0.920 5.7 9.017324 2.0148478 16.5 15.50179 25 5 2 3
5 2547.000 4603.0 2.100000 1.8000000 3.9 69.00000 624 3 5 4
6 10.550 179.5 9.100000 0.7000000 9.8 27.00000 180 4 4 4
Sleep_imp Gest_imp Span_imp Dream_imp NonD_imp
1 FALSE FALSE FALSE TRUE TRUE
2 FALSE FALSE FALSE FALSE FALSE
3 FALSE FALSE FALSE TRUE TRUE
4 FALSE FALSE TRUE TRUE TRUE
5 FALSE FALSE FALSE FALSE FALSE
6 FALSE FALSE FALSE FALSE FALSE
异常值与重复值的处理
n 异常值的处理:原则:根据数据的实际背景来判断
u 一些基础值:
> set.seed(2017)
> mmhg <- sample(60:250,1000, replace = TRUE)
> range(mmhg)
[1] 60 250
> min(mmhg)
[1] 60
> max(mmhg)
[1] 250
> quantile(mmhg) #四分位数
0% 25% 50% 75% 100%
60.00 104.75 154.00 199.00 250.00
> fivenum(mmhg) #四分位数
[1] 60.0 104.5 154.0 199.0 250.0
u 自定义函数:
> outlierKD <- function(dt,var){
+ var_name <-eval(substitute(var),eval(dt))
+ tot <- sum(!is.na(var_name))
+ na1 <- sum(is.na(var_name))
+ m1 <- mean(var_name,na.rm = T)
+ par(mfrow = c(2,2),oma = c(0,0,3,0))
+ boxplot(var_name,main ='With outliers')
+ hist(var_name,main ='With outliers',xlab = NA,ylab = NA)
+ outlier <- var_name[var_name >230]
+ mo <- mean(outlier)
+ var_name <- ifelse(var_name %in% outlier, NA, var_name)
+ boxplot(var_name,main ='Without outliers')
+ hist(var_name,main ='Without outliers',xlab = NA,ylab = NA)
+ title('Outlier Check',outer = T)
+ na2 <- sum(is.na(var_name))
+ cat('Outliers identified:',na2 - na1,' ')
+ cat('Propotion (%) of outliers:',round((na2 - na1)/ tot * 100,1),' ')
+ cat('Mean of the outliers:',round(mo,2),' ')
+ m2 <- mean(var_name,na.rm = T)
+ cat('Mean without removing outliers:',round(m1,2),' ')
+ cat('Mean if we remove outliers:',round(m2,2),' ')
+ response <- readline(prompt = 'Do you want to remove outliers
+ and to replace with NA? [yes/no]:')
+ if (response == 'y' | response == 'yes'){
+ dt[as.character(substitute(var))] <- invisible(var_name)
+ assign(as.character(as.list(match.call())$dt),dt,envir = .GlobalEnv)
+ cat('Outliers successfully removed',' ')
+ return(invisible(dt))
+ } else{
+ cat('Nothing changed',' ')
+ return(invisible(var_name))
+ }
+ }
> set.seed(2017)
> df <- data.frame(bp = c(sample(80:250,1000,replace = T),NA,390,100))
> outlierKD(df,bp)
Outliers identified: 126
Propotion (%) of outliers: 12.6
Mean of the outliers: NA
Mean without removing outliers: 163.79
Mean if we remove outliers: 152.71
Do you want to remove outliers
and to replace with NA? [yes/no]:
同时也会生成可视化的图。
n 重复值的处理:
u unique()函数:返回值 针对向量
> x<- c(1,2,3,4,5,1,2,3)
> unique(x)
[1] 1 2 3 4 5
u duplicated()函数:返回逻辑值T和F,可用于提取子集 针对向量
> x<- c(1,2,3,4,5,1,2,3)
> duplicated(x)
[1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
> x[duplicated(x)] #如要返回非重复值,前面加!
[1] 1 2 3
u anyDuplicated()函数:返回第一个重复值出现的位置 针对向量
> anyDuplicated(x)
[1] 6
u 对数据框进行重复值的处理
l 自己设置条件
library(readxl)
mydata <- read_excel('absolute path/filename.xlsx')
mydata[!(duplicated(mydata$name)& duplicated(mydata$birthday)),]
l 用函数实现:paste()函数:主要是对字符串进行操作
library(readxl)
mydata <- read_excel('absolute path/filename.xlsx')
mydata$test <- paste(mydata$name,mydata$birthday) # 把两个变量粘在一起
newdata <- mydata[!duplicated(mydata$test),]