赛题链接: https://tianchi.aliyun.com/competition/entrance/231593/introduction
天池的这个比赛用于学习数据挖掘是一个很好的例子,该题核心是对特征的处理与提取。
从大神代码里习得了xgboost, 学习了特征处理的精髓后,训练调优了一版GBDT。 效果还不错:
最好成绩101:
数据处理部分:
# -*- coding: utf-8 -*-
"""
Created on Wed Dec  4 14:30:36 2019

@author: wenzhe.tian

Feature-engineering script for the Tianchi O2O coupon-usage competition.
The competition provides real online/offline consumption records between
2016-01-01 and 2016-06-30; the task is to predict whether a coupon received
in July 2016 is redeemed within 15 days.
Note: the organisers anonymised the data and applied biased sampling and
filtering to protect user/merchant privacy.

TODO: add statutory-holiday indicators for the consumption dates.
"""

import pandas as pd
import numpy as np
from datetime import date

# ------------------------------------------------------------------ load data
off_train = pd.read_csv('data/ccf_offline_stage1_train.csv')
off_train.columns = ['user_id', 'merchant_id', 'coupon_id', 'discount_rate',
                     'distance', 'date_received', 'date']

off_test = pd.read_csv('data/ccf_offline_stage1_test_revised.csv')
off_test.columns = ['user_id', 'merchant_id', 'coupon_id', 'discount_rate',
                    'distance', 'date_received']

on_train = pd.read_csv('data/ccf_online_stage1_train.csv')
on_train.columns = ['user_id', 'merchant_id', 'action', 'coupon_id',
                    'discount_rate', 'date_received', 'date']

# ------------------------------------------------------------- date handling
# Missing dates are encoded as the literal string 'null' so the string
# comparisons used by the sliding-window splits below work on every row.
off_train['date'] = off_train['date'].fillna('null')
off_train['date_received'] = off_train['date_received'].fillna('null')
off_train.date = off_train.date.astype('str')
off_train.date_received = off_train.date_received.astype('str')

# --------------------------------------- sliding windows: three model splits
# dataset{n} is the label window; feature{n} is the preceding feature window
# (purchases dated inside the range, or coupon receipts with no purchase).
dataset3 = off_test
feature3 = off_train[
    ((off_train.date >= '20160315') & (off_train.date <= '20160630'))
    | ((off_train.date == 'null')
       & (off_train.date_received >= '20160315')
       & (off_train.date_received <= '20160630'))].copy()
dataset2 = off_train[(off_train.date_received >= '20160515')
                     & (off_train.date_received <= '20160615')].copy()
feature2 = off_train[
    ((off_train.date >= '20160201') & (off_train.date <= '20160514'))
    | ((off_train.date == 'null')
       & (off_train.date_received >= '20160201')
       & (off_train.date_received <= '20160514'))].copy()
dataset1 = off_train[(off_train.date_received >= '20160414')
                     & (off_train.date_received <= '20160514')].copy()
feature1 = off_train[
    ((off_train.date >= '20160101') & (off_train.date <= '20160413'))
    | ((off_train.date == 'null')
       & (off_train.date_received >= '20160101')
       & (off_train.date_received <= '20160413'))].copy()


############# other feature ##################
"""
5. other feature:
      this_month_user_receive_all_coupon_count
      this_month_user_receive_same_coupon_count
      this_month_user_receive_same_coupon_lastone
      this_month_user_receive_same_coupon_firstone
      this_day_user_receive_all_coupon_count
      this_day_user_receive_same_coupon_count
      day_gap_before, day_gap_after (receive the same coupon)
"""


def is_firstlastone(x):
    """Map a day gap to a flag: 1 = this receipt IS the first/last one,
    0 = it is not, -1 = the coupon was received only once (gap is NaN)."""
    if x == 0:
        return 1
    elif x > 0:
        return 0
    else:
        return -1  # NaN gap: the coupon was received only once


def get_day_gap_before(s):
    """Smallest positive gap in days between this receipt and any EARLIER
    receipt of the same coupon; -1 when there is none.

    `s` has the form '<date_received>-<d1>:<d2>:...' (yyyymmdd prefixes).
    """
    date_received, dates = s.split('-')
    gaps = []
    for d in dates.split(':'):
        this_gap = (date(int(date_received[0:4]), int(date_received[4:6]),
                         int(date_received[6:8]))
                    - date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    return min(gaps) if gaps else -1


def get_day_gap_after(s):
    """Smallest positive gap in days between this receipt and any LATER
    receipt of the same coupon; -1 when there is none."""
    date_received, dates = s.split('-')
    gaps = []
    for d in dates.split(':'):
        this_gap = (date(int(d[0:4]), int(d[4:6]), int(d[6:8]))
                    - date(int(date_received[0:4]), int(date_received[4:6]),
                           int(date_received[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    return min(gaps) if gaps else -1


def extract_other_features(ds):
    """Coupon-receiving behaviour features for one label window.

    The original script repeated this code verbatim for dataset3/2/1; it is
    factored into one helper here.  Returns one row per
    (user_id, coupon_id, date_received) with monthly/daily receive counts,
    first/last-receipt flags and day gaps to the nearest other receipt of
    the same coupon.
    """
    # how many coupons the user received in this window
    t = ds[['user_id']].copy()
    t['this_month_user_receive_all_coupon_count'] = 1
    t = t.groupby('user_id').agg('sum').reset_index()

    # how many times the user received this particular coupon
    t1 = ds[['user_id', 'coupon_id']].copy()
    t1['this_month_user_receive_same_coupon_count'] = 1
    t1 = t1.groupby(['user_id', 'coupon_id']).agg('sum').reset_index()

    # all receive dates of the same coupon, ':'-joined per (user, coupon)
    t2 = ds[['user_id', 'coupon_id', 'date_received']].copy()
    t2.date_received = t2.date_received.astype('str')
    t2 = (t2.groupby(['user_id', 'coupon_id'])['date_received']
            .agg(lambda x: ':'.join(x)).reset_index())
    t2['receive_number'] = t2.date_received.apply(lambda s: len(s.split(':')))
    # keep only coupons received two or more times
    t2 = t2[t2.receive_number > 1]
    t2['max_date_received'] = t2.date_received.apply(
        lambda s: max(int(float(d)) for d in s.split(':')))
    t2['min_date_received'] = t2.date_received.apply(
        lambda s: min(int(float(d)) for d in s.split(':')))
    t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]

    # flags: is this receipt the last / first one of this coupon for the user?
    t3 = ds[['user_id', 'coupon_id', 'date_received']].copy()
    t3 = pd.merge(t3, t2, on=['user_id', 'coupon_id'], how='left')
    t3['this_month_user_receive_same_coupon_lastone'] = (
        t3.max_date_received.astype('float')
        - t3.date_received.astype('float'))
    t3['this_month_user_receive_same_coupon_firstone'] = (
        t3.date_received.astype('float')
        - t3.min_date_received.astype('float'))
    t3.this_month_user_receive_same_coupon_lastone = \
        t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
    t3.this_month_user_receive_same_coupon_firstone = \
        t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
    t3 = t3[['user_id', 'coupon_id', 'date_received',
             'this_month_user_receive_same_coupon_lastone',
             'this_month_user_receive_same_coupon_firstone']]

    # coupons received per user per day
    t4 = ds[['user_id', 'date_received']].copy()
    t4['this_day_user_receive_all_coupon_count'] = 1
    t4 = t4.groupby(['user_id', 'date_received']).agg('sum').reset_index()

    # identical coupons received per user per day
    t5 = ds[['user_id', 'coupon_id', 'date_received']].copy()
    t5['this_day_user_receive_same_coupon_count'] = 1
    t5 = (t5.groupby(['user_id', 'coupon_id', 'date_received'])
            .agg('sum').reset_index())

    # gap in days to the nearest earlier/later receipt of the same coupon
    t6 = ds[['user_id', 'coupon_id', 'date_received']].copy()
    t6.date_received = t6.date_received.astype('str')
    t6 = (t6.groupby(['user_id', 'coupon_id'])['date_received']
            .agg(lambda x: ':'.join(x)).reset_index())
    t6.rename(columns={'date_received': 'dates'}, inplace=True)

    t7 = ds[['user_id', 'coupon_id', 'date_received']].copy()
    t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
    t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
    t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
    t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
    t7 = t7[['user_id', 'coupon_id', 'date_received',
             'day_gap_before', 'day_gap_after']]

    other = pd.merge(t1, t, on='user_id')
    other = pd.merge(other, t3, on=['user_id', 'coupon_id'])
    other = pd.merge(other, t4, on=['user_id', 'date_received'])
    other = pd.merge(other, t5, on=['user_id', 'coupon_id', 'date_received'])
    other = pd.merge(other, t7, on=['user_id', 'coupon_id', 'date_received'])
    return other


other_feature3 = extract_other_features(dataset3)
other_feature3.to_csv('data/other_feature3.csv', index=None)
print(other_feature3.shape)

other_feature2 = extract_other_features(dataset2)
other_feature2.to_csv('data/other_feature2.csv', index=None)
print(other_feature2.shape)

other_feature1 = extract_other_features(dataset1)
other_feature1.to_csv('data/other_feature1.csv', index=None)
print(other_feature1.shape)


############# coupon related feature #############
"""
2. coupon related:
      discount_rate, discount_man, discount_jian, is_man_jian,
      day_of_week, day_of_month (of date_received)
"""


def calc_discount_rate(s):
    """'x:y' ("spend x, save y") -> 1 - y/x; plain rates pass through."""
    s = str(s).split(':')
    if len(s) == 1:
        return float(s[0])
    else:
        return 1.0 - float(s[1]) / float(s[0])


def get_discount_man(s):
    """Threshold x of an 'x:y' coupon; 'null' for plain-rate coupons."""
    s = str(s).split(':')
    if len(s) == 1:
        return 'null'
    else:
        return int(s[0])


def get_discount_jian(s):
    """Rebate y of an 'x:y' coupon; 'null' for plain-rate coupons."""
    s = str(s).split(':')
    if len(s) == 1:
        return 'null'
    else:
        return int(s[1])


def is_man_jian(s):
    """1 when the coupon is of the 'spend x, save y' form, else 0."""
    s = str(s).split(':')
    if len(s) == 1:
        return 0
    else:
        return 1


# dataset3
dataset3['day_of_week'] = dataset3.date_received.astype('str').apply(
    lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday() + 1)
dataset3['day_of_month'] = dataset3.date_received.astype('str').apply(
    lambda x: int(x[6:8]))
# days since the end of the feature window
dataset3['days_distance'] = dataset3.date_received.astype('str').apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))
               - date(2016, 6, 30)).days)
dataset3['discount_man'] = dataset3.discount_rate.apply(get_discount_man)
dataset3['discount_jian'] = dataset3.discount_rate.apply(get_discount_jian)
dataset3['is_man_jian'] = dataset3.discount_rate.apply(is_man_jian)
dataset3['discount_rate'] = dataset3.discount_rate.apply(calc_discount_rate)
d = dataset3[['coupon_id']].copy()
d['coupon_count'] = 1
d = d.groupby('coupon_id').agg('sum').reset_index()
dataset3 = pd.merge(dataset3, d, on='coupon_id', how='left')
dataset3.to_csv('data/coupon3_feature.csv', index=None)

# dataset2
dataset2['day_of_week'] = dataset2.date_received.astype('str').apply(
    lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday() + 1)
dataset2['day_of_month'] = dataset2.date_received.astype('str').apply(
    lambda x: int(x[6:8]))
dataset2['days_distance'] = dataset2.date_received.astype('str').apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))
               - date(2016, 5, 14)).days)
dataset2['discount_man'] = dataset2.discount_rate.apply(get_discount_man)
dataset2['discount_jian'] = dataset2.discount_rate.apply(get_discount_jian)
dataset2['is_man_jian'] = dataset2.discount_rate.apply(is_man_jian)
dataset2['discount_rate'] = dataset2.discount_rate.apply(calc_discount_rate)
# coupon popularity: number of times each coupon was handed out
d = dataset2[['coupon_id']].copy()
d['coupon_count'] = 1
d = d.groupby('coupon_id').agg('sum').reset_index()
dataset2 = pd.merge(dataset2, d, on='coupon_id', how='left')
dataset2.to_csv('data/coupon2_feature.csv', index=None)

# dataset1
dataset1['day_of_week'] = dataset1.date_received.astype('str').apply(
    lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday() + 1)
dataset1['day_of_month'] = dataset1.date_received.astype('str').apply(
    lambda x: int(x[6:8]))
# days since the end of the feature window
dataset1['days_distance'] = dataset1.date_received.astype('str').apply(
    lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))
               - date(2016, 4, 13)).days)
dataset1['discount_man'] = dataset1.discount_rate.apply(get_discount_man)
dataset1['discount_jian'] = dataset1.discount_rate.apply(get_discount_jian)
dataset1['is_man_jian'] = dataset1.discount_rate.apply(is_man_jian)
dataset1['discount_rate'] = dataset1.discount_rate.apply(calc_discount_rate)
d = dataset1[['coupon_id']].copy()
d['coupon_count'] = 1
d = d.groupby('coupon_id').agg('sum').reset_index()
dataset1 = pd.merge(dataset1, d, on='coupon_id', how='left')
dataset1.to_csv('data/coupon1_feature.csv', index=None)


############# merchant related feature #############
"""
1. merchant related:
      total_sales, sales_use_coupon, total_coupon,
      coupon_rate = sales_use_coupon / total_sales,
      transfer_rate = sales_use_coupon / total_coupon,
      merchant min/max/mean/median distance of coupon-redeemed purchases
"""

# for dataset3
merchant3 = feature3[['merchant_id', 'coupon_id', 'distance',
                      'date_received', 'date']].copy()

# one row per merchant seen in the feature window
t = merchant3[['merchant_id']].copy()
t.drop_duplicates(inplace=True)

# total purchases (a consumption date is present)
t1 = merchant3[merchant3.date != 'null'][['merchant_id']].copy()
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').agg('sum').reset_index()

merchant3.coupon_id = merchant3.coupon_id.replace(np.nan, 'null')
# purchases made with a coupon
t2 = merchant3[(merchant3.date != 'null')
               & (merchant3.coupon_id != 'null')][['merchant_id']].copy()
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').agg('sum').reset_index()

# coupons handed out by the merchant
t3 = merchant3[merchant3.coupon_id != 'null'][['merchant_id']].copy()
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').agg('sum').reset_index()

# distance statistics over coupon-redeemed purchases
t4 = merchant3[(merchant3.date != 'null')
               & (merchant3.coupon_id != 'null')][['merchant_id',
                                                   'distance']].copy()
# normalise missing distances to NaN so the aggregations skip them
t4.replace('null', -1, inplace=True)
t4.replace(-1, np.nan, inplace=True)
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance': 'merchant_min_distance'}, inplace=True)

t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance': 'merchant_max_distance'}, inplace=True)

t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance': 'merchant_mean_distance'}, inplace=True)

t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance': 'merchant_median_distance'}, inplace=True)

merchant3_feature = pd.merge(t, t1, on='merchant_id', how='left')
merchant3_feature = pd.merge(merchant3_feature, t2, on='merchant_id', how='left')
merchant3_feature = pd.merge(merchant3_feature, t3, on='merchant_id', how='left')
merchant3_feature = pd.merge(merchant3_feature, t5, on='merchant_id', how='left')
merchant3_feature = pd.merge(merchant3_feature, t6, on='merchant_id', how='left')
merchant3_feature = pd.merge(merchant3_feature, t7, on='merchant_id', how='left')
merchant3_feature = pd.merge(merchant3_feature, t8, on='merchant_id', how='left')
merchant3_feature.sales_use_coupon = \
    merchant3_feature.sales_use_coupon.replace(np.nan, 0)  # fillna with 0
merchant3_feature['merchant_coupon_transfer_rate'] = (
    merchant3_feature.sales_use_coupon.astype('float')
    / merchant3_feature.total_coupon)
merchant3_feature['coupon_rate'] = (
    merchant3_feature.sales_use_coupon.astype('float')
    / merchant3_feature.total_sales)
merchant3_feature.total_coupon = \
    merchant3_feature.total_coupon.replace(np.nan, 0)  # fillna with 0
merchant3_feature.to_csv('data/merchant3_feature.csv', index=None)


# for dataset2
merchant2 = feature2[['merchant_id', 'coupon_id', 'distance',
                      'date_received', 'date']].copy()

t = merchant2[['merchant_id']].copy()
t.drop_duplicates(inplace=True)

t1 = merchant2[merchant2.date != 'null'][['merchant_id']].copy()
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').agg('sum').reset_index()

merchant2.coupon_id = merchant2.coupon_id.replace(np.nan, 'null')
t2 = merchant2[(merchant2.date != 'null')
               & (merchant2.coupon_id != 'null')][['merchant_id']].copy()
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').agg('sum').reset_index()

t3 = merchant2[merchant2.coupon_id != 'null'][['merchant_id']].copy()
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').agg('sum').reset_index()

t4 = merchant2[(merchant2.date != 'null')
               & (merchant2.coupon_id != 'null')][['merchant_id',
                                                   'distance']].copy()
t4.replace('null', -1, inplace=True)
t4.replace(-1, np.nan, inplace=True)
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance': 'merchant_min_distance'}, inplace=True)

t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance': 'merchant_max_distance'}, inplace=True)

t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance': 'merchant_mean_distance'}, inplace=True)

t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance': 'merchant_median_distance'}, inplace=True)

merchant2_feature = pd.merge(t, t1, on='merchant_id', how='left')
merchant2_feature = pd.merge(merchant2_feature, t2, on='merchant_id', how='left')
merchant2_feature = pd.merge(merchant2_feature, t3, on='merchant_id', how='left')
merchant2_feature = pd.merge(merchant2_feature, t5, on='merchant_id', how='left')
merchant2_feature = pd.merge(merchant2_feature, t6, on='merchant_id', how='left')
merchant2_feature = pd.merge(merchant2_feature, t7, on='merchant_id', how='left')
merchant2_feature = pd.merge(merchant2_feature, t8, on='merchant_id', how='left')
merchant2_feature.sales_use_coupon = \
    merchant2_feature.sales_use_coupon.replace(np.nan, 0)  # fillna with 0
merchant2_feature['merchant_coupon_transfer_rate'] = (
    merchant2_feature.sales_use_coupon.astype('float')
    / merchant2_feature.total_coupon)
merchant2_feature['coupon_rate'] = (
    merchant2_feature.sales_use_coupon.astype('float')
    / merchant2_feature.total_sales)
merchant2_feature.total_coupon = \
    merchant2_feature.total_coupon.replace(np.nan, 0)  # fillna with 0
merchant2_feature.to_csv('data/merchant2_feature.csv', index=None)


# for dataset1
merchant1 = feature1[['merchant_id', 'coupon_id', 'distance',
                      'date_received', 'date']].copy()

# one row per merchant seen in the feature window
t = merchant1[['merchant_id']].copy()
t.drop_duplicates(inplace=True)

# total purchases (a consumption date is present)
t1 = merchant1[merchant1.date != 'null'][['merchant_id']].copy()
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').agg('sum').reset_index()

merchant1.coupon_id = merchant1.coupon_id.replace(np.nan, 'null')
# purchases made with a coupon
t2 = merchant1[(merchant1.date != 'null')
               & (merchant1.coupon_id != 'null')][['merchant_id']].copy()
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').agg('sum').reset_index()

# coupons handed out by the merchant
t3 = merchant1[merchant1.coupon_id != 'null'][['merchant_id']].copy()
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').agg('sum').reset_index()

# distance statistics over coupon-redeemed purchases
t4 = merchant1[(merchant1.date != 'null')
               & (merchant1.coupon_id != 'null')][['merchant_id',
                                                   'distance']].copy()
t4.replace('null', -1, inplace=True)
t4.replace(-1, np.nan, inplace=True)
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance': 'merchant_min_distance'}, inplace=True)

t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance': 'merchant_max_distance'}, inplace=True)

t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance': 'merchant_mean_distance'}, inplace=True)

t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance': 'merchant_median_distance'}, inplace=True)

merchant1_feature = pd.merge(t, t1, on='merchant_id', how='left')
merchant1_feature = pd.merge(merchant1_feature, t2, on='merchant_id', how='left')
merchant1_feature = pd.merge(merchant1_feature, t3, on='merchant_id', how='left')
merchant1_feature = pd.merge(merchant1_feature, t5, on='merchant_id', how='left')
merchant1_feature = pd.merge(merchant1_feature, t6, on='merchant_id', how='left')
merchant1_feature = pd.merge(merchant1_feature, t7, on='merchant_id', how='left')
merchant1_feature = pd.merge(merchant1_feature, t8, on='merchant_id', how='left')
merchant1_feature.sales_use_coupon = \
    merchant1_feature.sales_use_coupon.replace(np.nan, 0)  # fillna with 0
merchant1_feature['merchant_coupon_transfer_rate'] = (
    merchant1_feature.sales_use_coupon.astype('float')
    / merchant1_feature.total_coupon)
merchant1_feature['coupon_rate'] = (
    merchant1_feature.sales_use_coupon.astype('float')
    / merchant1_feature.total_sales)
merchant1_feature.total_coupon = \
    merchant1_feature.total_coupon.replace(np.nan, 0)  # fillna with 0
merchant1_feature.to_csv('data/merchant1_feature.csv', index=None)


############# user related feature #############
"""
3. user related:
      count_merchant,
      user min/max/mean/median distance,
      buy_use_coupon, buy_total, coupon_received,
      buy_use_coupon / coupon_received,
      buy_use_coupon / buy_total,
      user_date_datereceived_gap
"""


def get_user_date_datereceived_gap(s):
    """Days between the purchase date and the coupon-receive date.
    `s` has the form '<date>:<date_received>' (yyyymmdd prefixes)."""
    s = s.split(':')
    return (date(int(s[0][0:4]), int(s[0][4:6]), int(s[0][6:8]))
            - date(int(s[1][0:4]), int(s[1][4:6]), int(s[1][6:8]))).days


# for dataset3
user3 = feature3[['user_id', 'merchant_id', 'coupon_id', 'discount_rate',
                  'distance', 'date_received', 'date']].copy()

# one row per user seen in the feature window
t = user3[['user_id']].copy()
t.drop_duplicates(inplace=True)

# number of distinct merchants the user bought from
t1 = user3[user3.date != 'null'][['user_id', 'merchant_id']].copy()
t1.drop_duplicates(inplace=True)
t1.merchant_id = 1
t1 = t1.groupby('user_id').agg('sum').reset_index()
t1.rename(columns={'merchant_id': 'count_merchant'}, inplace=True)

user3['coupon_id'] = user3['coupon_id'].replace(np.nan, 'null')
# distance statistics over coupon-redeemed purchases
t2 = user3[(user3.date != 'null')
           & (user3.coupon_id != 'null')][['user_id', 'distance']].copy()
t2.replace('null', -1, inplace=True)
t2.replace(-1, np.nan, inplace=True)
t3 = t2.groupby('user_id').agg('min').reset_index()
t3.rename(columns={'distance': 'user_min_distance'}, inplace=True)

t4 = t2.groupby('user_id').agg('max').reset_index()
t4.rename(columns={'distance': 'user_max_distance'}, inplace=True)

t5 = t2.groupby('user_id').agg('mean').reset_index()
t5.rename(columns={'distance': 'user_mean_distance'}, inplace=True)

t6 = t2.groupby('user_id').agg('median').reset_index()
t6.rename(columns={'distance': 'user_median_distance'}, inplace=True)

# purchases made with a coupon
t7 = user3[(user3.date != 'null') & (user3.coupon_id != 'null')][['user_id']].copy()
t7['buy_use_coupon'] = 1
t7 = t7.groupby('user_id').agg('sum').reset_index()

# total purchases
t8 = user3[user3.date != 'null'][['user_id']].copy()
t8['buy_total'] = 1
t8 = t8.groupby('user_id').agg('sum').reset_index()

# coupons received
t9 = user3[user3.coupon_id != 'null'][['user_id']].copy()
t9['coupon_received'] = 1
t9 = t9.groupby('user_id').agg('sum').reset_index()

# days between receiving a coupon and the purchase on the same record
t10 = user3[(user3.date_received != 'null')
            & (user3.date != 'null')][['user_id', 'date_received', 'date']].copy()
t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
t10.user_date_datereceived_gap = \
    t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]

t11 = t10.groupby('user_id').agg('mean').reset_index()
t11.rename(columns={'user_date_datereceived_gap':
                    'avg_user_date_datereceived_gap'}, inplace=True)
t12 = t10.groupby('user_id').agg('min').reset_index()
t12.rename(columns={'user_date_datereceived_gap':
                    'min_user_date_datereceived_gap'}, inplace=True)
t13 = t10.groupby('user_id').agg('max').reset_index()
t13.rename(columns={'user_date_datereceived_gap':
                    'max_user_date_datereceived_gap'}, inplace=True)

user3_feature = pd.merge(t, t1, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t3, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t4, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t5, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t6, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t7, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t8, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t9, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t11, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t12, on='user_id', how='left')
user3_feature = pd.merge(user3_feature, t13, on='user_id', how='left')
user3_feature.count_merchant = user3_feature.count_merchant.replace(np.nan, 0)
user3_feature.buy_use_coupon = user3_feature.buy_use_coupon.replace(np.nan, 0)
user3_feature['buy_use_coupon_rate'] = (
    user3_feature.buy_use_coupon.astype('float')
    / user3_feature.buy_total.astype('float'))
user3_feature['user_coupon_transfer_rate'] = (
    user3_feature.buy_use_coupon.astype('float')
    / user3_feature.coupon_received.astype('float'))
user3_feature.buy_total = user3_feature.buy_total.replace(np.nan, 0)
user3_feature.coupon_received = user3_feature.coupon_received.replace(np.nan, 0)
user3_feature.to_csv('data/user3_feature.csv', index=None)


# for dataset2
user2 = feature2[['user_id', 'merchant_id', 'coupon_id', 'discount_rate',
                  'distance', 'date_received', 'date']].copy()

t = user2[['user_id']].copy()
t.drop_duplicates(inplace=True)

t1 = user2[user2.date != 'null'][['user_id', 'merchant_id']].copy()
t1.drop_duplicates(inplace=True)
t1.merchant_id = 1
t1 = t1.groupby('user_id').agg('sum').reset_index()
t1.rename(columns={'merchant_id': 'count_merchant'}, inplace=True)

# BUG FIX: the original assigned user3['coupon_id'] into user2 here — a
# copy-paste slip that injected the wrong window's data; normalise user2's
# own column instead.
user2['coupon_id'] = user2['coupon_id'].replace(np.nan, 'null')
t2 = user2[(user2.date != 'null')
           & (user2.coupon_id != 'null')][['user_id', 'distance']].copy()
t2.replace('null', -1, inplace=True)
t2.replace(-1, np.nan, inplace=True)
t3 = t2.groupby('user_id').agg('min').reset_index()
t3.rename(columns={'distance': 'user_min_distance'}, inplace=True)

t4 = t2.groupby('user_id').agg('max').reset_index()
t4.rename(columns={'distance': 'user_max_distance'}, inplace=True)

t5 = t2.groupby('user_id').agg('mean').reset_index()
t5.rename(columns={'distance': 'user_mean_distance'}, inplace=True)

t6 = t2.groupby('user_id').agg('median').reset_index()
t6.rename(columns={'distance': 'user_median_distance'}, inplace=True)

t7 = user2[(user2.date != 'null') & (user2.coupon_id != 'null')][['user_id']].copy()
t7['buy_use_coupon'] = 1
t7 = t7.groupby('user_id').agg('sum').reset_index()

t8 = user2[user2.date != 'null'][['user_id']].copy()
t8['buy_total'] = 1
t8 = t8.groupby('user_id').agg('sum').reset_index()

t9 = user2[user2.coupon_id != 'null'][['user_id']].copy()
t9['coupon_received'] = 1
t9 = t9.groupby('user_id').agg('sum').reset_index()

t10 = user2[(user2.date_received != 'null')
            & (user2.date != 'null')][['user_id', 'date_received', 'date']].copy()
t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
t10.user_date_datereceived_gap = \
    t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]

t11 = t10.groupby('user_id').agg('mean').reset_index()
t11.rename(columns={'user_date_datereceived_gap':
                    'avg_user_date_datereceived_gap'}, inplace=True)
t12 = t10.groupby('user_id').agg('min').reset_index()
t12.rename(columns={'user_date_datereceived_gap':
                    'min_user_date_datereceived_gap'}, inplace=True)
t13 = t10.groupby('user_id').agg('max').reset_index()
t13.rename(columns={'user_date_datereceived_gap':
                    'max_user_date_datereceived_gap'}, inplace=True)

user2_feature = pd.merge(t, t1, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t3, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t4, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t5, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t6, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t7, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t8, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t9, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t11, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t12, on='user_id', how='left')
user2_feature = pd.merge(user2_feature, t13, on='user_id', how='left')
user2_feature.count_merchant = user2_feature.count_merchant.replace(np.nan, 0)
user2_feature.buy_use_coupon = user2_feature.buy_use_coupon.replace(np.nan, 0)
user2_feature['buy_use_coupon_rate'] = (
    user2_feature.buy_use_coupon.astype('float')
    / user2_feature.buy_total.astype('float'))
user2_feature['user_coupon_transfer_rate'] = (
    user2_feature.buy_use_coupon.astype('float')
    / user2_feature.coupon_received.astype('float'))
user2_feature.buy_total = user2_feature.buy_total.replace(np.nan, 0)
user2_feature.coupon_received = user2_feature.coupon_received.replace(np.nan, 0)
user2_feature.to_csv('data/user2_feature.csv', index=None)


# for dataset1
user1 = feature1[['user_id', 'merchant_id', 'coupon_id', 'discount_rate',
                  'distance', 'date_received', 'date']].copy()

t = user1[['user_id']].copy()
t.drop_duplicates(inplace=True)

t1 = user1[user1.date != 'null'][['user_id', 'merchant_id']].copy()
t1.drop_duplicates(inplace=True)
t1.merchant_id = 1
t1 = t1.groupby('user_id').agg('sum').reset_index()
t1.rename(columns={'merchant_id': 'count_merchant'}, inplace=True)

# BUG FIX: as above, the original read user3['coupon_id'] here as well;
# use user1's own column.
user1['coupon_id'] = user1['coupon_id'].replace(np.nan, 'null')
t2 = user1[(user1.date != 'null')
           & (user1.coupon_id != 'null')][['user_id', 'distance']].copy()
t2.replace('null', -1, inplace=True)
t2.replace(-1, np.nan, inplace=True)
t3 = t2.groupby('user_id').agg('min').reset_index()
t3.rename(columns={'distance': 'user_min_distance'}, inplace=True)

t4 = t2.groupby('user_id').agg('max').reset_index()
t4.rename(columns={'distance': 'user_max_distance'}, inplace=True)

t5 = t2.groupby('user_id').agg('mean').reset_index()
t5.rename(columns={'distance': 'user_mean_distance'}, inplace=True)

t6 = t2.groupby('user_id').agg('median').reset_index()
t6.rename(columns={'distance': 'user_median_distance'}, inplace=True)

t7 = user1[(user1.date != 'null') & (user1.coupon_id != 'null')][['user_id']].copy()
t7['buy_use_coupon'] = 1
# NOTE(review): this source chunk is truncated here; the remaining user1
# aggregation (t7 groupby onward) continues in the next part of the file.
t7.groupby('user_id').agg('sum').reset_index() 771 772 t8 = user1[user1.date!='null'][['user_id']] 773 t8['buy_total'] = 1 774 t8 = t8.groupby('user_id').agg('sum').reset_index() 775 776 t9 = user1[user1.coupon_id!='null'][['user_id']] 777 t9['coupon_received'] = 1 778 t9 = t9.groupby('user_id').agg('sum').reset_index() 779 780 t10 = user1[(user1.date_received!='null')&(user1.date!='null')][['user_id','date_received','date']] 781 t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received 782 t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap) 783 t10 = t10[['user_id','user_date_datereceived_gap']] 784 785 t11 = t10.groupby('user_id').agg('mean').reset_index() 786 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True) 787 t12 = t10.groupby('user_id').agg('min').reset_index() 788 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True) 789 t13 = t10.groupby('user_id').agg('max').reset_index() 790 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True) 791 792 user1_feature = pd.merge(t,t1,on='user_id',how='left') 793 user1_feature = pd.merge(user1_feature,t3,on='user_id',how='left') 794 user1_feature = pd.merge(user1_feature,t4,on='user_id',how='left') 795 user1_feature = pd.merge(user1_feature,t5,on='user_id',how='left') 796 user1_feature = pd.merge(user1_feature,t6,on='user_id',how='left') 797 user1_feature = pd.merge(user1_feature,t7,on='user_id',how='left') 798 user1_feature = pd.merge(user1_feature,t8,on='user_id',how='left') 799 user1_feature = pd.merge(user1_feature,t9,on='user_id',how='left') 800 user1_feature = pd.merge(user1_feature,t11,on='user_id',how='left') 801 user1_feature = pd.merge(user1_feature,t12,on='user_id',how='left') 802 user1_feature = pd.merge(user1_feature,t13,on='user_id',how='left') 803 user1_feature.count_merchant = 
user1_feature.count_merchant.replace(np.nan,0) 804 user1_feature.buy_use_coupon = user1_feature.buy_use_coupon.replace(np.nan,0) 805 user1_feature['buy_use_coupon_rate'] = user1_feature.buy_use_coupon.astype('float') / user1_feature.buy_total.astype('float') 806 user1_feature['user_coupon_transfer_rate'] = user1_feature.buy_use_coupon.astype('float') / user1_feature.coupon_received.astype('float') 807 user1_feature.buy_total = user1_feature.buy_total.replace(np.nan,0) 808 user1_feature.coupon_received = user1_feature.coupon_received.replace(np.nan,0) 809 user1_feature.to_csv('data/user1_feature.csv',index=None) 810 811 812 813 ################## user_merchant related feature ######################### 814 815 """ 816 4.user_merchant: 817 times_user_buy_merchant_before. 818 """ 819 #for dataset3 820 all_user_merchant = feature3[['user_id','merchant_id']] 821 all_user_merchant.drop_duplicates(inplace=True) 822 823 t = feature3[['user_id','merchant_id','date']] 824 t = t[t.date!='null'][['user_id','merchant_id']] 825 t['user_merchant_buy_total'] = 1 826 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index() 827 t.drop_duplicates(inplace=True) 828 829 t1 = feature3[['user_id','merchant_id','coupon_id']] 830 t1.coupon_id=t1.coupon_id.replace(np.nan,'null') 831 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']] 832 t1['user_merchant_received'] = 1 833 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index() 834 t1.drop_duplicates(inplace=True) 835 836 t2 = feature3[['user_id','merchant_id','date','date_received']] 837 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']] 838 t2['user_merchant_buy_use_coupon'] = 1 839 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index() 840 t2.drop_duplicates(inplace=True) 841 842 t3 = feature3[['user_id','merchant_id']] 843 t3['user_merchant_any'] = 1 844 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index() 845 t3.drop_duplicates(inplace=True) 846 847 t4 
= feature3[['user_id','merchant_id','date','coupon_id']] 848 t4.coupon_id=t4.coupon_id.replace(np.nan,'null') 849 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']] 850 t4['user_merchant_buy_common'] = 1 851 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index() 852 t4.drop_duplicates(inplace=True) 853 854 user_merchant3 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left') 855 user_merchant3 = pd.merge(user_merchant3,t1,on=['user_id','merchant_id'],how='left') 856 user_merchant3 = pd.merge(user_merchant3,t2,on=['user_id','merchant_id'],how='left') 857 user_merchant3 = pd.merge(user_merchant3,t3,on=['user_id','merchant_id'],how='left') 858 user_merchant3 = pd.merge(user_merchant3,t4,on=['user_id','merchant_id'],how='left') 859 user_merchant3.user_merchant_buy_use_coupon = user_merchant3.user_merchant_buy_use_coupon.replace(np.nan,0) 860 user_merchant3.user_merchant_buy_common = user_merchant3.user_merchant_buy_common.replace(np.nan,0) 861 user_merchant3['user_merchant_coupon_transfer_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_received.astype('float') 862 user_merchant3['user_merchant_coupon_buy_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_buy_total.astype('float') 863 user_merchant3['user_merchant_rate'] = user_merchant3.user_merchant_buy_total.astype('float') / user_merchant3.user_merchant_any.astype('float') 864 user_merchant3['user_merchant_common_buy_rate'] = user_merchant3.user_merchant_buy_common.astype('float') / user_merchant3.user_merchant_buy_total.astype('float') 865 user_merchant3.to_csv('data/user_merchant3.csv',index=None) 866 867 #for dataset2 868 all_user_merchant = feature2[['user_id','merchant_id']] 869 all_user_merchant.drop_duplicates(inplace=True) 870 871 t = feature2[['user_id','merchant_id','date']] 872 t = t[t.date!='null'][['user_id','merchant_id']] 873 
# User-merchant interaction counts for the feature2 window -> user_merchant2.
# `t` (purchase records, already filtered) becomes the per-pair purchase count.
t['user_merchant_buy_total'] = 1
t = t.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
t.drop_duplicates(inplace=True)

# coupons received per (user, merchant)
received = feature2[['user_id', 'merchant_id', 'coupon_id']]
received.coupon_id = received.coupon_id.replace(np.nan, 'null')
received = received[received.coupon_id != 'null'][['user_id', 'merchant_id']]
received['user_merchant_received'] = 1
received = received.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
received.drop_duplicates(inplace=True)

# purchases made after receiving a coupon, per (user, merchant)
used = feature2[['user_id', 'merchant_id', 'date', 'date_received']]
used = used[(used.date != 'null') & (used.date_received != 'null')][['user_id', 'merchant_id']]
used['user_merchant_buy_use_coupon'] = 1
used = used.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
used.drop_duplicates(inplace=True)

# any kind of record per (user, merchant)
any_cnt = feature2[['user_id', 'merchant_id']]
any_cnt['user_merchant_any'] = 1
any_cnt = any_cnt.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
any_cnt.drop_duplicates(inplace=True)

# purchases without any coupon per (user, merchant)
common = feature2[['user_id', 'merchant_id', 'date', 'coupon_id']]
common.coupon_id = common.coupon_id.replace(np.nan, 'null')
common = common[(common.date != 'null') & (common.coupon_id == 'null')][['user_id', 'merchant_id']]
common['user_merchant_buy_common'] = 1
common = common.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
common.drop_duplicates(inplace=True)

# left-join every count onto the full (user, merchant) universe
user_merchant2 = pd.merge(all_user_merchant, t, on=['user_id', 'merchant_id'], how='left')
for part in (received, used, any_cnt, common):
    user_merchant2 = pd.merge(user_merchant2, part, on=['user_id', 'merchant_id'], how='left')
user_merchant2.user_merchant_buy_use_coupon = user_merchant2.user_merchant_buy_use_coupon.replace(np.nan, 0)
user_merchant2.user_merchant_buy_common = user_merchant2.user_merchant_buy_common.replace(np.nan, 0)
# Ratio features for user_merchant2, then persist.  Column order is kept
# because it determines the CSV layout.
used_f = user_merchant2.user_merchant_buy_use_coupon.astype('float')
total_f = user_merchant2.user_merchant_buy_total.astype('float')
user_merchant2['user_merchant_coupon_transfer_rate'] = used_f / user_merchant2.user_merchant_received.astype('float')
user_merchant2['user_merchant_coupon_buy_rate'] = used_f / total_f
user_merchant2['user_merchant_rate'] = total_f / user_merchant2.user_merchant_any.astype('float')
user_merchant2['user_merchant_common_buy_rate'] = user_merchant2.user_merchant_buy_common.astype('float') / total_f
user_merchant2.to_csv('data/user_merchant2.csv', index=None)

# for dataset1 (the original comment said dataset2 — copy/paste slip)
all_user_merchant = feature1[['user_id', 'merchant_id']]
all_user_merchant.drop_duplicates(inplace=True)

# t..t3 keep their original names: the following section consumes them.
frame = feature1[['user_id', 'merchant_id', 'date']]
frame = frame[frame.date != 'null'][['user_id', 'merchant_id']]
frame['user_merchant_buy_total'] = 1
t = frame.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
t.drop_duplicates(inplace=True)

frame = feature1[['user_id', 'merchant_id', 'coupon_id']]
frame.coupon_id = frame.coupon_id.replace(np.nan, 'null')
frame = frame[frame.coupon_id != 'null'][['user_id', 'merchant_id']]
frame['user_merchant_received'] = 1
t1 = frame.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
t1.drop_duplicates(inplace=True)

frame = feature1[['user_id', 'merchant_id', 'date', 'date_received']]
frame = frame[(frame.date != 'null') & (frame.date_received != 'null')][['user_id', 'merchant_id']]
frame['user_merchant_buy_use_coupon'] = 1
t2 = frame.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
t2.drop_duplicates(inplace=True)

frame = feature1[['user_id', 'merchant_id']]
frame['user_merchant_any'] = 1
t3 = frame.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
t3.drop_duplicates(inplace=True)

t4 = feature1[['user_id', 'merchant_id', 'date', 'coupon_id']]
# purchases without a coupon per (user, merchant), feature1 window
t4.coupon_id = t4.coupon_id.replace(np.nan, 'null')
t4 = t4[(t4.date != 'null') & (t4.coupon_id == 'null')][['user_id', 'merchant_id']]
t4['user_merchant_buy_common'] = 1
t4 = t4.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
t4.drop_duplicates(inplace=True)

# assemble, fill counts, derive ratios, persist
user_merchant1 = pd.merge(all_user_merchant, t, on=['user_id', 'merchant_id'], how='left')
for piece in (t1, t2, t3, t4):
    user_merchant1 = pd.merge(user_merchant1, piece, on=['user_id', 'merchant_id'], how='left')
user_merchant1.user_merchant_buy_use_coupon = user_merchant1.user_merchant_buy_use_coupon.replace(np.nan, 0)
user_merchant1.user_merchant_buy_common = user_merchant1.user_merchant_buy_common.replace(np.nan, 0)
use_f = user_merchant1.user_merchant_buy_use_coupon.astype('float')
buy_f = user_merchant1.user_merchant_buy_total.astype('float')
user_merchant1['user_merchant_coupon_transfer_rate'] = use_f / user_merchant1.user_merchant_received.astype('float')
user_merchant1['user_merchant_coupon_buy_rate'] = use_f / buy_f
user_merchant1['user_merchant_rate'] = buy_f / user_merchant1.user_merchant_any.astype('float')
user_merchant1['user_merchant_common_buy_rate'] = user_merchant1.user_merchant_buy_common.astype('float') / buy_f
user_merchant1.to_csv('data/user_merchant1.csv', index=None)

################## generate training and testing set ################
def get_label(s):
    """Map a 'date:date_received' string (yyyymmdd[…] parts) to a label.

    Returns 0 when the coupon was never used (date is 'nan'),
    1 when it was used within 15 days of receipt, -1 otherwise.
    """
    consume, received = s.split(':')
    if consume == 'nan':
        return 0
    gap = (date(int(consume[0:4]), int(consume[4:6]), int(consume[6:8]))
           - date(int(received[0:4]), int(received[4:6]), int(received[6:8]))).days
    return 1 if gap <= 15 else -1
# Assemble the final train/test tables by joining the five per-window
# feature files, adding calendar features, and (for the two labelled
# windows) the 15-day usage label.  Refactored from three copy-pasted
# sections; file contents and column order are unchanged.

def _load_and_merge(idx):
    """Read the five feature csv's for window `idx`, left-join and dedupe.

    Prints the resulting shape (as the original script did) and returns
    the merged DataFrame.
    """
    coupon = pd.read_csv('data/coupon%d_feature.csv' % idx)
    merchant = pd.read_csv('data/merchant%d_feature.csv' % idx)
    user = pd.read_csv('data/user%d_feature.csv' % idx)
    user_merchant = pd.read_csv('data/user_merchant%d.csv' % idx)
    other = pd.read_csv('data/other_feature%d.csv' % idx)
    ds = pd.merge(coupon, merchant, on='merchant_id', how='left')
    ds = pd.merge(ds, user, on='user_id', how='left')
    ds = pd.merge(ds, user_merchant, on=['user_id', 'merchant_id'], how='left')
    ds = pd.merge(ds, other, on=['user_id', 'coupon_id', 'date_received'], how='left')
    ds.drop_duplicates(inplace=True)
    print(ds.shape)
    return ds


def _add_calendar_features(ds):
    """Zero-fill the user_merchant counts and add is_weekend + weekday dummies."""
    ds.user_merchant_buy_total = ds.user_merchant_buy_total.replace(np.nan, 0)
    ds.user_merchant_any = ds.user_merchant_any.replace(np.nan, 0)
    ds.user_merchant_received = ds.user_merchant_received.replace(np.nan, 0)
    ds['is_weekend'] = ds.day_of_week.apply(lambda x: 1 if x in (6, 7) else 0)
    dummies = pd.get_dummies(ds.day_of_week)
    dummies.columns = ['weekday' + str(i + 1) for i in range(dummies.shape[1])]
    return pd.concat([ds, dummies], axis=1)


def _finish_train_window(ds, out_path):
    """Attach the label, drop id/leakage columns and persist a training window."""
    ds['label'] = ds.date.astype('str') + ':' + ds.date_received.astype('str')
    ds.label = ds.label.apply(get_label)
    ds.drop(['merchant_id', 'day_of_week', 'date', 'date_received', 'coupon_id', 'coupon_count'],
            axis=1, inplace=True)
    ds = ds.replace('null', np.nan)
    ds.to_csv(out_path, index=None)
    return ds


# test window: keep user_id/coupon_id/date_received for the submission, no label
dataset3 = _add_calendar_features(_load_and_merge(3))
dataset3.drop(['merchant_id', 'day_of_week', 'coupon_count'], axis=1, inplace=True)
dataset3 = dataset3.replace('null', np.nan)
dataset3.to_csv('data/dataset3.csv', index=None)

dataset2 = _finish_train_window(_add_calendar_features(_load_and_merge(2)), 'data/dataset2.csv')
dataset1 = _finish_train_window(_add_calendar_features(_load_and_merge(1)), 'data/dataset1.csv')
GBDT训练部分:
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 9 13:21:52 2019

@author: wenzhe.tian

GBDT tuning order (GridSearchCV can be used):
1. n_estimators & learning_rate
2. max_depth (may end up as low as 1)
3. min_samples_split & min_samples_leaf
4. max_features (consider a subset when there are many features)
5. subsample (fraction of samples used per tree)
"""
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingRegressor
# FIX: sklearn.grid_search was removed in scikit-learn 0.20;
# prefer model_selection and fall back only for very old installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
import pickle
import os
import datetime

dataset1 = pd.read_csv('data/dataset1.csv')
dataset1.label.replace(-1, 0, inplace=True)  # "used after 15 days" counts as negative
dataset2 = pd.read_csv('data/dataset2.csv')
dataset2.label.replace(-1, 0, inplace=True)
dataset3 = pd.read_csv('data/dataset3.csv')

dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset3.drop_duplicates(inplace=True)

# train on both labelled windows together
dataset12 = pd.concat([dataset1, dataset2], axis=0)

dataset1_y = dataset1.label
# 'day_gap_before'/'day_gap_after' cause overfitting (score dropped to 0.77)
dataset1_x = dataset1.drop(['user_id', 'label', 'day_gap_before', 'day_gap_after'], axis=1)
dataset2_y = dataset2.label
dataset2_x = dataset2.drop(['user_id', 'label', 'day_gap_before', 'day_gap_after'], axis=1)
dataset12_y = dataset12.label
dataset12_x = dataset12.drop(['user_id', 'label', 'day_gap_before', 'day_gap_after'], axis=1)
dataset3_preds = dataset3[['user_id', 'coupon_id', 'date_received']]
dataset3_x = dataset3.drop(['user_id', 'coupon_id', 'date_received', 'day_gap_before', 'day_gap_after'], axis=1)

dataset12_x = dataset12_x.fillna(-100)
dataset3_x = dataset3_x.fillna(-100)

score = []

# tuned parameters (min_samples_split=4 performed best)
gbdt = ensemble.GradientBoostingRegressor(
    loss='ls',  # NOTE(review): renamed to 'squared_error' in scikit-learn >= 1.0 — confirm installed version
    learning_rate=0.11,
    n_estimators=160,
    subsample=1,
    min_samples_split=4,
    min_samples_leaf=1,
    max_depth=4,
    init=None,
    random_state=None,
    max_features=None,
    alpha=0.9,
    verbose=1,
    max_leaf_nodes=None,
    warm_start=False)
gbdt.fit(dataset12_x, dataset12_y)

score.append(gbdt.score(dataset12_x, dataset12_y))

dataset3_preds['label'] = gbdt.predict(dataset3_x)
# FIX: Series.reshape was removed from pandas — go through the underlying
# numpy array and flatten back to 1-D before assigning the scaled scores.
dataset3_preds.label = MinMaxScaler().fit_transform(
    dataset3_preds.label.values.reshape(-1, 1)).ravel()
dataset3_preds.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset3_preds.to_csv("results" + datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S') + ".csv",
                      index=None, header=None)

# os.system('shutdown -s -t 10')
GridSearchCV的一个调参案例:
# Tuning n_estimators with GridSearchCV (example).
# NOTE(review): this targets the legacy API — `sklearn.grid_search`, the
# `iid=` argument and `grid_scores_` were all removed in modern scikit-learn
# (use sklearn.model_selection and `cv_results_` there).
param_test2 = {'n_estimators': [50, 75, 100, 125, 150, 200, 300, 400]}
base_model = GradientBoostingRegressor(
    loss='ls',
    learning_rate=0.1,
    # n_estimators=125,
    subsample=1,
    min_samples_split=5,  # 2
    min_samples_leaf=1,  # 1
    max_depth=5,
    init=None,
    random_state=None,
    max_features=None,
    alpha=0.9,
    verbose=1,
    max_leaf_nodes=None,
    warm_start=False)
gsearch2 = GridSearchCV(estimator=base_model, param_grid=param_test2,
                        scoring='roc_auc', iid=False, cv=None)
gsearch2.fit(dataset12_x, dataset12_y)
ans_vid1 = {
    'grid_scores': gsearch2.grid_scores_,
    'best_params': gsearch2.best_params_,
    'best_score': gsearch2.best_score_,
}