# Use caret to do the feature selection for the random forest (rf) model.
# Read the training and test CSV files and tag each row with its origin so
# both sets can share one feature-engineering pass and be split again later.
train_row <- read.csv("E:/kuaipan/Kaggle Project/Bike Sharing Demand/train.csv")
test_row <- read.csv("e:/kuaipan/Kaggle Project/Bike Sharing Demand/test.csv")
train_row$type <- "train"
test_row$type <- "test"
colnames(train_row)

# The test frame gets the three target columns filled with NA so that it has
# the same columns as the training frame before the two are stacked.
target_cols <- c("casual", "registered", "count")
test_row[target_cols] <- NA
row_df <- rbind(train_row, test_row)
row_df_backup <- row_df

# Add log(x + 1) versions of the targets as casual_log, registered_log and
# count_log (in that order).
row_df[paste0(target_cols, "_log")] <- log(row_df[target_cols] + 1)
# Make sure lubridate is installed, then attach it.
# The package name must be a quoted string for install.packages(); the
# explicit library() call attaches the package even when it has just been
# installed, and fails loudly if the installation did not succeed.
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate")
}
library(lubridate)
# Parse the timestamp and derive calendar features from it.
row_df$dt <- ymd_hms(row_df$datetime)
row_df$day <- day(row_df$dt)
row_df$month <- month(row_df$dt)
row_df$year <- year(row_df$dt)
row_df$hour <- hour(row_df$dt)
row_df$weekday <- wday(row_df$dt)
row_df$week <- week(row_df$dt)

# season_count: total `count` observed in the training rows of each season.
# The total is computed from training rows only (test rows carry NA counts),
# but it is written to every row of that season so the feature is also
# defined for the test rows instead of being left as NA there.
is_train <- row_df$type == "train"
for (s in 1:4) {
  in_season <- row_df$season == s
  season_total <- sum(row_df[which(in_season & is_train), "count"])
  row_df[which(in_season), "season_count"] <- season_total
}
#head(row_df)
# Override the workingday / holiday flags for specific calendar dates.
#
# Arguments:
#   year, month, day  Values matched with %in% against the year, month and
#                     day columns; a row is updated when all three match.
#   value             New workingday flag. holiday is set to the opposite:
#                     0 when value is 1, otherwise 1.
#   data              Data frame to update; defaults to the global row_df.
#
# Returns the updated data frame. R copies a data frame when it is modified
# inside a function, so the caller must assign the result back; modifying
# row_df in the function body alone never reaches the global object.
SetWorkingDay <- function(year, month, day, value, data = row_df) {
  rows <- which(
    data$year %in% year & data$month %in% month & data$day %in% day
  )
  data[rows, "workingday"] <- value
  data[rows, "holiday"] <- if (value == 1) 0 else 1
  data
}

# Dates forced to be working days.
row_df <- SetWorkingDay(2011, 4, 15, 1)
row_df <- SetWorkingDay(2012, 4, 16, 1)
# Dates forced to be non-working days.
row_df <- SetWorkingDay(2011, 11, 25, 0)
row_df <- SetWorkingDay(2012, 11, 23, 0)
# Individual dates that are flagged as holidays, given as (year, month, day).
extra_holidays <- list(
  c(2011, 11, 25),
  c(2012, 11, 23),
  c(2012, 5, 21),
  c(2012, 6, 1),
  c(2012, 10, 30)
)
for (d in extra_holidays) {
  on_date <- row_df$year == d[1] & row_df$month == d[2] & row_df$day == d[3]
  row_df[which(on_date), "holiday"] <- 1
}

# End-of-year adjustments, applied to December of every year in the data.
# NOTE(review): Dec 24 and 31 end up with holiday = 1 AND workingday = 1;
# confirm that combination is intended.
in_december <- row_df$month == 12
row_df[which(in_december & row_df$day %in% c(24, 26, 31)), "holiday"] <- 1
row_df[which(in_december & row_df$day %in% c(24, 31)), "workingday"] <- 1
# peak: 1 on working days at hours 8, 12, 17 and 18, and on non-working days
# from hour 10 through hour 19 (inclusive); 0 otherwise.
# R has no chained comparison, so the hour range is written as two
# comparisons joined with `&`.
peak_rows <- which(
  (row_df$workingday == 1 & row_df$hour %in% c(8, 17, 18, 12)) |
    (row_df$workingday == 0 & row_df$hour >= 10 & row_df$hour <= 19)
)
row_df$peak <- 0
row_df$peak[peak_rows] <- 1

# ideal: 1 when temp is above 27 and windspeed is below 30; 0 otherwise.
ideal_rows <- which(row_df$temp > 27 & row_df$windspeed < 30)
row_df$ideal <- 0
row_df$ideal[ideal_rows] <- 1

# sticky: 1 on working days with humidity of at least 60; 0 otherwise.
sticky_rows <- which(row_df$humidity >= 60 & row_df$workingday == 1)
row_df$sticky <- 0
row_df$sticky[sticky_rows] <- 1
# Split the engineered frame back into its training and test parts using the
# origin tag stored in the `type` column.
train_idx <- which(row_df$type == "train")
test_idx <- which(row_df$type == "test")
row_df.train <- row_df[train_idx, ]
row_df.test <- row_df[test_idx, ]
library(Metrics)
# Root mean squared logarithmic error between predictions and observations.
#
# Arguments:
#   pred    Numeric vector of predicted values on the original (non-log)
#           scale; values must be greater than -1.
#   actual  Numeric vector of observed values on the same scale.
#
# Returns a single non-negative number: sqrt(mean((log(pred + 1) -
# log(actual + 1))^2)). A perfect prediction scores 0.
#
# The log transform is applied exactly once here; taking logs of the inputs
# before an RMSLE routine that logs them again, and then exponentiating the
# result, does not yield the RMSLE.
get_rmsle <- function(pred, actual) {
  log_diff <- log1p(pred) - log1p(actual)
  sqrt(mean(log_diff^2))
}
library(caret)
# Recursive feature elimination using caret's random-forest helper functions,
# evaluated with ten-fold cross-validation.
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)

# Columns excluded from the predictor set, by position in row_df.train.
# NOTE(review): positional indices depend on the column order of the input
# files and of the feature-engineering steps above; confirm they still point
# at the intended columns whenever a column is added or removed.
excluded_cols <- c(10, 11, 12, 14, 15, 16, 1, 17, 13)
row_df.features <- row_df.train[, -excluded_cols]

# The predictors and the outcome must come from the same data set.
# NOTE(review): column 14 is presumably casual_log - confirm.
# Besides the requested subset sizes, rfe also evaluates the subset that
# contains all features.
result <- rfe(
  row_df.features,
  row_df.train[, 14],
  sizes = 16:17,
  rfeControl = control
)
plot(result, type = c("p", "l"))
# The final random-forest model is result$fit.