8. 数据排序
> leadership$age [1] 32 45 25 39 NA > newdata <- leadership[order(leadership$age),] > newdata manager testDate country gender age item1 item2 item3 item4 item5 3 3 2008-10-01 UK F 25 3 5 5 5 2 1 1 2008-10-24 US M 32 5 4 5 5 5 4 4 2008-10-12 UK M 39 3 3 4 NA NA 2 2 2008-10-28 US F 45 3 5 2 5 5 5 5 2009-05-01 UK F NA 2 2 1 2 1 stringAsFactors agecat 3 FALSE Young 1 FALSE Young 4 FALSE Young 2 FALSE Young 5 FALSE <NA> > > > attach(leadership) The following objects are masked _by_ .GlobalEnv: age, country, gender, manager > newdata <- leadership[order(gender, age),] > detach(leadership) > newdata manager testDate country gender age item1 item2 item3 item4 item5 3 3 2008-10-01 UK F 25 3 5 5 5 2 2 2 2008-10-28 US F 45 3 5 2 5 5 5 5 2009-05-01 UK F NA 2 2 1 2 1 1 1 2008-10-24 US M 32 5 4 5 5 5 4 4 2008-10-12 UK M 39 3 3 4 NA NA stringAsFactors agecat 3 FALSE Young 2 FALSE Young 5 FALSE <NA> 1 FALSE Young 4 FALSE Young > > attach(leadership) The following objects are masked _by_ .GlobalEnv: age, country, gender, manager > newdata <- leadership[order(gender, -age),] > detach(leadership) > newdata manager testDate country gender age item1 item2 item3 item4 item5 5 5 2009-05-01 UK F NA 2 2 1 2 1 2 2 2008-10-28 US F 45 3 5 2 5 5 3 3 2008-10-01 UK F 25 3 5 5 5 2 4 4 2008-10-12 UK M 39 3 3 4 NA NA 1 1 2008-10-24 US M 32 5 4 5 5 5 stringAsFactors agecat 5 FALSE <NA> 2 FALSE Young 3 FALSE Young 4 FALSE Young 1 FALSE Young >
9. 数据集的合并
9.1 添加列
> patientID <- c(1, 2, 3, 4) > age <- c(25, 34, 28, 52) > status <- c("poor", "improved", "excellent", "poor") > gender <- c("F", "M", "M", "F") > dataframeA <- data.frame(patientID, gender) > dataframeA patientID gender 1 1 F 2 2 M 3 3 M 4 4 F > dataframeB <- data.frame(patientID, age, status) > dataframeB patientID age status 1 1 25 poor 2 2 34 improved 3 3 28 excellent 4 4 52 poor > total <- merge(dataframeA, dataframeB, by="ID") Error in fix.by(by.x, x) : 'by' must specify a uniquely valid column > total <- merge(dataframeA, dataframeB, by="patientID") > total patientID gender age status 1 1 F 25 poor 2 2 M 34 improved 3 3 M 28 excellent 4 4 F 52 poor > total <- merge(dataframeA, dataframeB, by=c("gender", "age")) Error in fix.by(by.x, x) : 'by' must specify a uniquely valid column > total <- merge(dataframeA, dataframeB, by=c("patientID", "age")) Error in fix.by(by.x, x) : 'by' must specify a uniquely valid column > > total <- cbind(dataframeA, dataframeB) > total patientID gender patientID age status 1 1 F 1 25 poor 2 2 M 2 34 improved 3 3 M 3 28 excellent 4 4 F 4 52 poor >
注：merge() 的 by 参数必须是两个数据框中都存在的列名。"ID" 在两个数据框中都不存在；gender 只在 dataframeA 中，age 只在 dataframeB 中，所以上面三次调用都报错，只有 by="patientID" 成功。cbind() 只是把两个数据框按行位置横向拼接，不做键匹配，因此结果中 patientID 列出现了两次。
9.2 添加行
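> total <- rbind(dataframeA, dataframeB) Error in rbind(deparse.level, ...) : numbers of columns of arguments do not match
注：rbind() 纵向合并要求两个数据框拥有相同的变量。dataframeA 有 2 列（patientID、gender），dataframeB 有 3 列（patientID、age、status），列数不一致，因此报错。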
10. 数据集取子集
10.1 选入(保留)变量
> newdata <- leadership[, c(6:10)] > newdata item1 item2 item3 item4 item5 1 5 4 5 5 5 2 3 5 2 5 5 3 3 5 5 5 2 4 3 3 4 NA NA 5 2 2 1 2 1 > > > myvars <- c("item1","item2","item3","item4","item5") > newdata <- leadership[myvars] > newdata item1 item2 item3 item4 item5 1 5 4 5 5 5 2 3 5 2 5 5 3 3 5 5 5 2 4 3 3 4 NA NA 5 2 2 1 2 1 > > > myvar <- paste("item", 1:5, seq="") > myvar [1] "item 1 " "item 2 " "item 3 " "item 4 " "item 5 " > myvar <- paste("item", 1:5, sep="") > myvar [1] "item1" "item2" "item3" "item4" "item5" > newdata <- leadership[myvar] > newdata item1 item2 item3 item4 item5 1 5 4 5 5 5 2 3 5 2 5 5 3 3 5 5 5 2 4 3 3 4 NA NA 5 2 2 1 2 1 >
注：第一次调用 paste() 时把参数 sep 误写成了 seq。seq="" 被当作又一个待拼接的空字符串，并用默认分隔符（空格）连接，所以得到 "item 1 " 这样带空格的结果；改为 sep="" 后才得到 "item1" 等正确的变量名。
10.2 剔除(丢弃)变量
> myvars <- names(leadership) %in% c("item3", "item4") > myvars [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE > newdata <- leadership[!myvars] > newdata manager testDate country gender age item1 item2 item5 stringAsFactors 1 1 2008-10-24 US M 32 5 4 5 FALSE 2 2 2008-10-28 US F 45 3 5 5 FALSE 3 3 2008-10-01 UK F 25 3 5 2 FALSE 4 4 2008-10-12 UK M 39 3 3 NA FALSE 5 5 2009-05-01 UK F NA 2 2 1 FALSE agecat 1 Young 2 Young 3 Young 4 Young 5 <NA> > > > names(leadership) [1] "manager" "testDate" "country" "gender" [5] "age" "item1" "item2" "item3" [9] "item4" "item5" "stringAsFactors" "agecat" > > newdata <- leadership[c(-8,-9)] > newdata manager testDate country gender age item1 item2 item5 stringAsFactors 1 1 2008-10-24 US M 32 5 4 5 FALSE 2 2 2008-10-28 US F 45 3 5 5 FALSE 3 3 2008-10-01 UK F 25 3 5 2 FALSE 4 4 2008-10-12 UK M 39 3 3 NA FALSE 5 5 2009-05-01 UK F NA 2 2 1 FALSE agecat 1 Young 2 Young 3 Young 4 Young 5 <NA> > leadership$item3 <- leadership$item4 <- NULL > leadership manager testDate country gender age item1 item2 item5 stringAsFactors 1 1 2008-10-24 US M 32 5 4 5 FALSE 2 2 2008-10-28 US F 45 3 5 5 FALSE 3 3 2008-10-01 UK F 25 3 5 2 FALSE 4 4 2008-10-12 UK M 39 3 3 NA FALSE 5 5 2009-05-01 UK F NA 2 2 1 FALSE agecat 1 Young 2 Young 3 Young 4 Young 5 <NA> >
10.3 选入观测
> newdata <- leadership[1:3,] > newdata manager testDate country gender age item1 item2 item5 stringAsFactors 1 1 2008-10-24 US M 32 5 4 5 FALSE 2 2 2008-10-28 US F 45 3 5 5 FALSE 3 3 2008-10-01 UK F 25 3 5 2 FALSE agecat 1 Young 2 Young 3 Young > newdata <- leadership[which(leadership$gender=="M" & leadership$age > 30),] > newdata manager testDate country gender age item1 item2 item5 stringAsFactors 1 1 2008-10-24 US M 32 5 4 5 FALSE 4 4 2008-10-12 UK M 39 3 3 NA FALSE agecat 1 Young 4 Young > attach(leadership) The following objects are masked _by_ .GlobalEnv: age, country, gender, manager > newdata1 <- leadership[which(gender=='M' & age > 30),] > detach(leadership) > newdata1 manager testDate country gender age item1 item2 item5 stringAsFactors 2 2 2008-10-28 US F 45 3 5 5 FALSE agecat 2 Young >
注意：attach(leadership) 时提示 age、gender 等对象被 .GlobalEnv 屏蔽，因此 which(gender=='M' & age > 30) 实际使用的是 9.1 节在全局环境中创建的向量 gender <- c("F", "M", "M", "F") 和 age <- c(25, 34, 28, 52)，而不是 leadership 的列。这两个向量中只有第 2 个元素同时满足条件，所以 newdata1 错误地返回了第 2 行（F，45），与上面 newdata 的正确结果（第 1、4 行）不一致。应避免使用 attach()，改为显式写 leadership$gender、leadership$age，或使用 with(leadership, ...)。
> leadership$date <- as.Date(leadership$date, "%m/%d/%y") Error in as.Date.default(leadership$date, "%m/%d/%y") : do not know how to convert 'leadership$date' to class “Date” > leadership$testDate <- as.Date(leadership$testDate, "%m/%d/%y") > startdate <- as.Date("2009-01-01") > enddate <- as.Date("2009-10-31") > newdate <- leadership[which(leadership$testDate >= startdate & leadership$testDate <= enddate),] > newdate manager testDate country gender age item1 item2 item5 stringAsFactors 5 5 2009-05-01 UK F NA 2 2 1 FALSE agecat 5 <NA> >
注：数据框中没有名为 date 的列（日期变量名为 testDate），leadership$date 为 NULL，所以第一条命令报错；改用 leadership$testDate 后才能正常执行。
10.4 subset() 函数
> leadership manager testDate country gender age item1 item2 item5 stringAsFactors 1 1 2008-10-24 US M 32 5 4 5 FALSE 2 2 2008-10-28 US F 45 3 5 5 FALSE 3 3 2008-10-01 UK F 25 3 5 2 FALSE 4 4 2008-10-12 UK M 39 3 3 NA FALSE 5 5 2009-05-01 UK F NA 2 2 1 FALSE agecat 1 Young 2 Young 3 Young 4 Young 5 <NA> > newdata <- subset(leadership, age >= 35 | age < 24, select=c(item1, item2, item5)) > newdata item1 item2 item5 2 3 5 5 4 3 3 NA > > newdata <- subset(leadership, gender=="M" & age > 25, select=gender:item5) > newdata gender age item1 item2 item5 1 M 32 5 4 5 4 M 39 3 3 NA >
10.5 随机抽样
> leadership manager testDate country gender age item1 item2 item5 stringAsFactors 1 1 2008-10-24 US M 32 5 4 5 FALSE 2 2 2008-10-28 US F 45 3 5 5 FALSE 3 3 2008-10-01 UK F 25 3 5 2 FALSE 4 4 2008-10-12 UK M 39 3 3 NA FALSE 5 5 2009-05-01 UK F NA 2 2 1 FALSE agecat 1 Young 2 Young 3 Young 4 Young 5 <NA> > > mysample <- leadership[sample(1:nrow(leadership), 3, replace=FALSE),] > mysample manager testDate country gender age item1 item2 item5 stringAsFactors 4 4 2008-10-12 UK M 39 3 3 NA FALSE 2 2 2008-10-28 US F 45 3 5 5 FALSE 1 1 2008-10-24 US M 32 5 4 5 FALSE agecat 4 Young 2 Young 1 Young >