• 阿里云天池新人赛o2o优惠券使用预测-------进阶心得


    赛题链接: https://tianchi.aliyun.com/competition/entrance/231593/introduction

    天池的这个比赛用于学习数据挖掘是一个很好的例子,该题核心是对特征的处理与提取。

    从大神代码里习得了xgboost, 学习了特征处理的精髓后,训练调优了一版GBDT。 效果还不错:

    最好成绩: 排名第101名:

    数据处理部分:

# -*- coding: utf-8 -*-
"""
Created on Wed Dec  4 14:30:36 2019

@author: wenzhe.tian

Tianchi O2O coupon-usage prediction. The competition provides real
online/offline consumption behaviour between 2016-01-01 and 2016-06-30;
the task is to predict whether a user redeems a coupon within 15 days of
receiving it in July 2016.
Note: to protect user/merchant privacy, all data is anonymised and was
produced with biased sampling plus necessary filtering.

TODO: add consumption-date features (e.g. statutory holidays?).
"""

import pandas as pd
import numpy as np
from datetime import date

# ---- load raw data ----
off_train = pd.read_csv('data/ccf_offline_stage1_train.csv')
off_train.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']

off_test = pd.read_csv('data/ccf_offline_stage1_test_revised.csv')
off_test.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received']

on_train = pd.read_csv('data/ccf_online_stage1_train.csv')
on_train.columns = ['user_id','merchant_id','action','coupon_id','discount_rate','date_received','date']

# ---- normalise dates: missing -> 'null', everything to str so that
# yyyymmdd values can be compared lexicographically below ----
off_train['date']= off_train['date'].fillna('null')
off_train['date_received']= off_train['date_received'].fillna('null')
off_train.date= off_train.date.astype('str')
off_train.date_received= off_train.date_received.astype('str')

# ---- sliding-window split into three (feature-window, label-window) pairs.
# datasetN: rows whose coupons were received inside the label window;
# featureN: rows consumed inside the feature window, plus unredeemed
#           coupons received inside that window.
dataset3 = off_test
feature3 = off_train[((off_train.date>='20160315')&(off_train.date<='20160630'))|((off_train.date=='null')&(off_train.date_received>='20160315')&(off_train.date_received<='20160630'))]
dataset2 = off_train[(off_train.date_received>='20160515')&(off_train.date_received<='20160615')]
feature2 = off_train[(off_train.date>='20160201')&(off_train.date<='20160514')|((off_train.date=='null')&(off_train.date_received>='20160201')&(off_train.date_received<='20160514'))]
dataset1 = off_train[(off_train.date_received>='20160414')&(off_train.date_received<='20160514')]
feature1 = off_train[(off_train.date>='20160101')&(off_train.date<='20160413')|((off_train.date=='null')&(off_train.date_received>='20160101')&(off_train.date_received<='20160413'))]
      42 
      43 
############# other feature #############
"""
5. other feature:
      this_month_user_receive_all_coupon_count
      this_month_user_receive_same_coupon_count
      this_month_user_receive_same_coupon_lastone
      this_month_user_receive_same_coupon_firstone
      this_day_user_receive_all_coupon_count
      this_day_user_receive_same_coupon_count
      day_gap_before, day_gap_after  (receive the same coupon)
"""

# for dataset3 (the label window)
# per-user count of all coupons received this month
t = dataset3[['user_id']]
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('user_id').agg('sum').reset_index()

# per (user, coupon) count of identical coupons received this month
t1 = dataset3[['user_id','coupon_id']]
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()

# per (user, coupon): all receive dates joined with ':'
t2 = dataset3[['user_id','coupon_id','date_received']]
t2.date_received = t2.date_received.astype('str')
t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
# number of times this coupon was received
t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
# keep only coupons received at least twice
t2 = t2[t2.receive_number>1]
# earliest / latest receive date (yyyymmdd as int) per (user, coupon)
t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')]))
t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]

# gap (raw yyyymmdd arithmetic) to the latest/earliest receipt of the
# same coupon; NaN where the coupon was received only once
t3 = dataset3[['user_id','coupon_id','date_received']]
t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')

t3.max_date_received=t3.max_date_received.astype('float')
t3.date_received=t3.date_received.astype('float')
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received - t3.min_date_received
      88 def is_firstlastone(x):
      89     if x==0:
      90         return 1
      91     elif x>0:
      92         return 0
      93     else:
      94         return -1 #those only receive once
      95 
# encode the raw gaps into the first/last categorical flags
t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]

# per (user, day): number of coupons received that day
t4 = dataset3[['user_id','date_received']]
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()

# per (user, coupon, day): number of identical coupons received that day
t5 = dataset3[['user_id','coupon_id','date_received']]
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()

# per (user, coupon): ':'-joined receive dates, renamed to 'dates'
t6 = dataset3[['user_id','coupon_id','date_received']]
t6.date_received = t6.date_received.astype('str')
t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
t6.rename(columns={'date_received':'dates'},inplace=True)
     112 
     113 def get_day_gap_before(s):
     114     date_received,dates = s.split('-')
     115     dates = dates.split(':')
     116     gaps = []
     117     for d in dates:
     118         this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days
     119         if this_gap>0:
     120             gaps.append(this_gap)
     121     if len(gaps)==0:
     122         return -1
     123     else:
     124         return min(gaps)
     125 
     126 def get_day_gap_after(s):
     127     date_received,dates = s.split('-')
     128     dates = dates.split(':')
     129     gaps = []
     130     for d in dates:
     131         this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
     132         if this_gap>0:
     133             gaps.append(this_gap)
     134     if len(gaps)==0:
     135         return -1
     136     else:
     137         return min(gaps)
     138 
     139 
# nearest-neighbour day gaps to earlier/later receipts of the same coupon
t7 = dataset3[['user_id','coupon_id','date_received']]
t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]

# assemble all "other" features for dataset3 and persist them
other_feature3 = pd.merge(t1,t,on='user_id')
other_feature3 = pd.merge(other_feature3,t3,on=['user_id','coupon_id'])
other_feature3 = pd.merge(other_feature3,t4,on=['user_id','date_received'])
other_feature3 = pd.merge(other_feature3,t5,on=['user_id','coupon_id','date_received'])
other_feature3 = pd.merge(other_feature3,t7,on=['user_id','coupon_id','date_received'])
other_feature3.to_csv('data/other_feature3.csv',index=None)
print (other_feature3.shape)
     154 
     155 
     156 
#for dataset2 -- same "other" features as dataset3, over its label window
# per-user count of all coupons received
t = dataset2[['user_id']]
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('user_id').agg('sum').reset_index()

# per (user, coupon) count of identical coupons received
t1 = dataset2[['user_id','coupon_id']]
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()

# ':'-joined receive dates, then earliest/latest for coupons received twice+
t2 = dataset2[['user_id','coupon_id','date_received']]
t2.date_received = t2.date_received.astype('str')
t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
t2 = t2[t2.receive_number>1]
t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')]))
t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]

# gap (raw yyyymmdd arithmetic) to latest/earliest receipt of the same coupon
t3 = dataset2[['user_id','coupon_id','date_received']]
t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received.astype('float').astype('int')
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype('float').astype('int') - t3.min_date_received
# NOTE(review): redundant redefinition -- identical to the earlier is_firstlastone
def is_firstlastone(x):
    if x==0:
        return 1
    elif x>0:
        return 0
    else:
        return -1 #those only receive once

t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]

# per (user, day) and per (user, coupon, day) receive counts
t4 = dataset2[['user_id','date_received']]
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()

t5 = dataset2[['user_id','coupon_id','date_received']]
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()

# per (user, coupon): all receive dates joined with ':'
t6 = dataset2[['user_id','coupon_id','date_received']]
t6.date_received = t6.date_received.astype('str')
t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
t6.rename(columns={'date_received':'dates'},inplace=True)
     203 
# NOTE(review): redundant redefinitions -- byte-identical to the earlier
# get_day_gap_before / get_day_gap_after; could be deleted safely.
def get_day_gap_before(s):
    # minimum day gap to any earlier receipt of the same coupon; -1 if none
    date_received,dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days
        if this_gap>0:
            gaps.append(this_gap)
    if len(gaps)==0:
        return -1
    else:
        return min(gaps)

def get_day_gap_after(s):
    # minimum day gap to any later receipt of the same coupon; -1 if none
    date_received,dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
        if this_gap>0:
            gaps.append(this_gap)
    if len(gaps)==0:
        return -1
    else:
        return min(gaps)
     229 
     230 
# nearest-neighbour day gaps to earlier/later receipts of the same coupon
t7 = dataset2[['user_id','coupon_id','date_received']]
t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]

# assemble all "other" features for dataset2 and persist them
other_feature2 = pd.merge(t1,t,on='user_id')
other_feature2 = pd.merge(other_feature2,t3,on=['user_id','coupon_id'])
other_feature2 = pd.merge(other_feature2,t4,on=['user_id','date_received'])
other_feature2 = pd.merge(other_feature2,t5,on=['user_id','coupon_id','date_received'])
other_feature2 = pd.merge(other_feature2,t7,on=['user_id','coupon_id','date_received'])
other_feature2.to_csv('data/other_feature2.csv',index=None)
print (other_feature2.shape)
     245 
     246 
     247 
#for dataset1 -- same "other" features as dataset3, over its label window
# per-user count of all coupons received
t = dataset1[['user_id']]
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('user_id').agg('sum').reset_index()

# per (user, coupon) count of identical coupons received
t1 = dataset1[['user_id','coupon_id']]
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()

# ':'-joined receive dates, then earliest/latest for coupons received twice+
t2 = dataset1[['user_id','coupon_id','date_received']]
t2.date_received = t2.date_received.astype('str')
t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
t2 = t2[t2.receive_number>1]
t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')]))
t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]

# gap (raw yyyymmdd arithmetic) to latest/earliest receipt of the same coupon
t3 = dataset1[['user_id','coupon_id','date_received']]
t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received.astype('float').astype('int')
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype('float').astype('int') - t3.min_date_received
# NOTE(review): redundant redefinition -- identical to the earlier is_firstlastone
def is_firstlastone(x):
    if x==0:
        return 1
    elif x>0:
        return 0
    else:
        return -1 #those only receive once

t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]

# per (user, day) and per (user, coupon, day) receive counts
t4 = dataset1[['user_id','date_received']]
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()

t5 = dataset1[['user_id','coupon_id','date_received']]
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()

# per (user, coupon): all receive dates joined with ':'
t6 = dataset1[['user_id','coupon_id','date_received']]
t6.date_received = t6.date_received.astype('str')
t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
t6.rename(columns={'date_received':'dates'},inplace=True)
     294 
# NOTE(review): redundant redefinitions -- byte-identical to the earlier
# get_day_gap_before / get_day_gap_after; could be deleted safely.
def get_day_gap_before(s):
    # minimum day gap to any earlier receipt of the same coupon; -1 if none
    date_received,dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days
        if this_gap>0:
            gaps.append(this_gap)
    if len(gaps)==0:
        return -1
    else:
        return min(gaps)

def get_day_gap_after(s):
    # minimum day gap to any later receipt of the same coupon; -1 if none
    date_received,dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
        if this_gap>0:
            gaps.append(this_gap)
    if len(gaps)==0:
        return -1
    else:
        return min(gaps)
     320 
     321 
# nearest-neighbour day gaps to earlier/later receipts of the same coupon
t7 = dataset1[['user_id','coupon_id','date_received']]
t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]

# assemble all "other" features for dataset1 and persist them
other_feature1 = pd.merge(t1,t,on='user_id')
other_feature1 = pd.merge(other_feature1,t3,on=['user_id','coupon_id'])
other_feature1 = pd.merge(other_feature1,t4,on=['user_id','date_received'])
other_feature1 = pd.merge(other_feature1,t5,on=['user_id','coupon_id','date_received'])
other_feature1 = pd.merge(other_feature1,t7,on=['user_id','coupon_id','date_received'])
other_feature1.to_csv('data/other_feature1.csv',index=None)
print (other_feature1.shape)
     336 
     337 
     338 
     339 
     340 
     341 
     342 ############# coupon related feature   #############
     343 """
     344 2.coupon related:
     345       discount_rate. discount_man. discount_jian. is_man_jian
     346       day_of_week,day_of_month. (date_received)
     347 """
     348 def calc_discount_rate(s):
     349     s =str(s)
     350     s = s.split(':')
     351     if len(s)==1:
     352         return float(s[0])
     353     else:
     354         return 1.0-float(s[1])/float(s[0])
     355 
     356 def get_discount_man(s):
     357     s =str(s)
     358     s = s.split(':')
     359     if len(s)==1:
     360         return 'null'
     361     else:
     362         return int(s[0])
     363 
     364 def get_discount_jian(s):
     365     s =str(s)
     366     s = s.split(':')
     367     if len(s)==1:
     368         return 'null'
     369     else:
     370         return int(s[1])
     371 
     372 def is_man_jian(s):
     373     s =str(s)
     374     s = s.split(':')
     375     if len(s)==1:
     376         return 0
     377     else:
     378         return 1
     379 
#dataset3
# calendar features of the receive date (day_of_week: 1=Mon .. 7=Sun)
dataset3['day_of_week'] = dataset3.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
dataset3['day_of_month'] = dataset3.date_received.astype('str').apply(lambda x:int(x[6:8]))
# days between the receive date and the end of this window's feature period
dataset3['days_distance'] = dataset3.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,6,30)).days)
# decompose the discount: threshold, amount, man:jian flag, and final rate
dataset3['discount_man'] = dataset3.discount_rate.apply(get_discount_man)
dataset3['discount_jian'] = dataset3.discount_rate.apply(get_discount_jian)
dataset3['is_man_jian'] = dataset3.discount_rate.apply(is_man_jian)
dataset3['discount_rate'] = dataset3.discount_rate.apply(calc_discount_rate)
# coupon popularity: number of rows sharing the same coupon_id
d = dataset3[['coupon_id']]
d['coupon_count'] = 1
d = d.groupby('coupon_id').agg('sum').reset_index()
dataset3 = pd.merge(dataset3,d,on='coupon_id',how='left')
dataset3.to_csv('data/coupon3_feature.csv',index=None)
#dataset2 -- same features, anchored at 2016-05-14
dataset2['day_of_week'] = dataset2.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
dataset2['day_of_month'] = dataset2.date_received.astype('str').apply(lambda x:int(x[6:8]))
dataset2['days_distance'] = dataset2.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,5,14)).days)
dataset2['discount_man'] = dataset2.discount_rate.apply(get_discount_man)
dataset2['discount_jian'] = dataset2.discount_rate.apply(get_discount_jian)
dataset2['is_man_jian'] = dataset2.discount_rate.apply(is_man_jian)
dataset2['discount_rate'] = dataset2.discount_rate.apply(calc_discount_rate)
d = dataset2[['coupon_id']]
d['coupon_count'] = 1
d = d.groupby('coupon_id').agg('sum').reset_index()
dataset2 = pd.merge(dataset2,d,on='coupon_id',how='left')
dataset2.to_csv('data/coupon2_feature.csv',index=None)
#dataset1 -- same features, anchored at 2016-04-13
dataset1['day_of_week'] = dataset1.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
dataset1['day_of_month'] = dataset1.date_received.astype('str').apply(lambda x:int(x[6:8]))
dataset1['days_distance'] = dataset1.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,4,13)).days)
dataset1['discount_man'] = dataset1.discount_rate.apply(get_discount_man)
dataset1['discount_jian'] = dataset1.discount_rate.apply(get_discount_jian)
dataset1['is_man_jian'] = dataset1.discount_rate.apply(is_man_jian)
dataset1['discount_rate'] = dataset1.discount_rate.apply(calc_discount_rate)
d = dataset1[['coupon_id']]
d['coupon_count'] = 1
d = d.groupby('coupon_id').agg('sum').reset_index()
dataset1 = pd.merge(dataset1,d,on='coupon_id',how='left')
dataset1.to_csv('data/coupon1_feature.csv',index=None)
     419 
     420 
     421 ############# merchant related feature   #############
     422 """
     423 1.merchant related:
     424       total_sales. sales_use_coupon.  total_coupon
     425       coupon_rate = sales_use_coupon/total_sales.
     426       transfer_rate = sales_use_coupon/total_coupon.
     427       merchant_avg_distance,merchant_min_distance,merchant_max_distance of those use coupon
     428 
     429 """
     430 
#for dataset3
# merchant-level stats computed over the feature window
merchant3 = feature3[['merchant_id','coupon_id','distance','date_received','date']]

# distinct merchants
t = merchant3[['merchant_id']]
t.drop_duplicates(inplace=True)

# total purchases per merchant (rows with a consumption date)
t1 = merchant3[merchant3.date!='null'][['merchant_id']]
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').agg('sum').reset_index()

# purchases made with a coupon ('null' normalised for coupon_id first)
merchant3.coupon_id=merchant3.coupon_id.replace(np.nan,'null')
t2 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id']]
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').agg('sum').reset_index()

# coupons issued per merchant
t3 = merchant3[merchant3.coupon_id!='null'][['merchant_id']]
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').agg('sum').reset_index()

# distance stats of coupon-redeeming purchases ('null' distance -> NaN)
t4 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id','distance']]
t4.replace('null',-1,inplace=True)
#t4.distance = t4.distance.astype('int')
t4.replace(-1,np.nan,inplace=True)
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance':'merchant_min_distance'},inplace=True)

t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance':'merchant_max_distance'},inplace=True)

t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True)

t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance':'merchant_median_distance'},inplace=True)

# assemble merchant features; rate columns = coupon purchases divided by
# coupons issued (transfer rate) or by total sales (coupon rate)
merchant3_feature = pd.merge(t,t1,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t2,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t3,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t5,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t6,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t7,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t8,on='merchant_id',how='left')
merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0
merchant3_feature['merchant_coupon_transfer_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_coupon
merchant3_feature['coupon_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_sales
merchant3_feature.total_coupon = merchant3_feature.total_coupon.replace(np.nan,0) #fillna with 0
merchant3_feature.to_csv('data/merchant3_feature.csv',index=None)
     478 
     479 
#for dataset2 -- same merchant features over feature window 2
merchant2 = feature2[['merchant_id','coupon_id','distance','date_received','date']]

# distinct merchants
t = merchant2[['merchant_id']]
t.drop_duplicates(inplace=True)

# total purchases per merchant
t1 = merchant2[merchant2.date!='null'][['merchant_id']]
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').agg('sum').reset_index()

# purchases made with a coupon
merchant2.coupon_id=merchant2.coupon_id.replace(np.nan,'null')
t2 = merchant2[(merchant2.date!='null')&(merchant2.coupon_id!='null')][['merchant_id']]
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').agg('sum').reset_index()

# coupons issued per merchant
t3 = merchant2[merchant2.coupon_id!='null'][['merchant_id']]
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').agg('sum').reset_index()

# distance stats of coupon-redeeming purchases ('null' distance -> NaN)
t4 = merchant2[(merchant2.date!='null')&(merchant2.coupon_id!='null')][['merchant_id','distance']]
t4.replace('null',-1,inplace=True)
#t4.distance = t4.distance.astype('int')
t4.replace(-1,np.nan,inplace=True)
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance':'merchant_min_distance'},inplace=True)

t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance':'merchant_max_distance'},inplace=True)

t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True)

t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance':'merchant_median_distance'},inplace=True)

# assemble merchant features and derived rates
merchant2_feature = pd.merge(t,t1,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t2,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t3,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t5,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t6,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t7,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t8,on='merchant_id',how='left')
merchant2_feature.sales_use_coupon = merchant2_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0
merchant2_feature['merchant_coupon_transfer_rate'] = merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_coupon
merchant2_feature['coupon_rate'] = merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_sales
merchant2_feature.total_coupon = merchant2_feature.total_coupon.replace(np.nan,0) #fillna with 0
merchant2_feature.to_csv('data/merchant2_feature.csv',index=None)
     527 
#for dataset1 -- same merchant features over feature window 1
merchant1 = feature1[['merchant_id','coupon_id','distance','date_received','date']]

# distinct merchants
t = merchant1[['merchant_id']]
t.drop_duplicates(inplace=True)

# total purchases per merchant
t1 = merchant1[merchant1.date!='null'][['merchant_id']]
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').agg('sum').reset_index()

# purchases made with a coupon
merchant1.coupon_id=merchant1.coupon_id.replace(np.nan,'null')
t2 = merchant1[(merchant1.date!='null')&(merchant1.coupon_id!='null')][['merchant_id']]
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').agg('sum').reset_index()

# coupons issued per merchant
t3 = merchant1[merchant1.coupon_id!='null'][['merchant_id']]
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').agg('sum').reset_index()

# distance stats of coupon-redeeming purchases ('null' distance -> NaN)
t4 = merchant1[(merchant1.date!='null')&(merchant1.coupon_id!='null')][['merchant_id','distance']]
t4.replace('null',-1,inplace=True)
#t4.distance = t4.distance.astype('int')
t4.replace(-1,np.nan,inplace=True)
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance':'merchant_min_distance'},inplace=True)

t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance':'merchant_max_distance'},inplace=True)

t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True)

t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance':'merchant_median_distance'},inplace=True)


# assemble merchant features and derived rates
merchant1_feature = pd.merge(t,t1,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t2,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t3,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t5,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t6,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t7,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t8,on='merchant_id',how='left')
merchant1_feature.sales_use_coupon = merchant1_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0
merchant1_feature['merchant_coupon_transfer_rate'] = merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_coupon
merchant1_feature['coupon_rate'] = merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_sales
merchant1_feature.total_coupon = merchant1_feature.total_coupon.replace(np.nan,0) #fillna with 0
merchant1_feature.to_csv('data/merchant1_feature.csv',index=None)
     576 
     577 
############# user related feature   #############
"""
3.user related:
      count_merchant.
      user_avg_distance, user_min_distance,user_max_distance.
      buy_use_coupon. buy_total. coupon_received.
      buy_use_coupon/coupon_received.
      buy_use_coupon/buy_total
      user_date_datereceived_gap
"""
     590 
     591 def get_user_date_datereceived_gap(s):
     592     s = s.split(':')
     593     return (date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8])) - date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days
     594 
#for dataset3
# user-level stats computed over the feature window
user3 = feature3[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]

# distinct users
t = user3[['user_id']]
t.drop_duplicates(inplace=True)

# number of distinct merchants each user bought from
t1 = user3[user3.date!='null'][['user_id','merchant_id']]
t1.drop_duplicates(inplace=True)
t1.merchant_id = 1
t1 = t1.groupby('user_id').agg('sum').reset_index()
t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
# distance stats of coupon-redeeming purchases ('null' distance -> NaN)
user3['coupon_id']=user3['coupon_id'].replace(np.nan,'null')
t2 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id','distance']]
t2.replace('null',-1,inplace=True)
#t2.distance = t2.distance.astype('int')
t2.replace(-1,np.nan,inplace=True)
t3 = t2.groupby('user_id').agg('min').reset_index()
t3.rename(columns={'distance':'user_min_distance'},inplace=True)

t4 = t2.groupby('user_id').agg('max').reset_index()
t4.rename(columns={'distance':'user_max_distance'},inplace=True)

t5 = t2.groupby('user_id').agg('mean').reset_index()
t5.rename(columns={'distance':'user_mean_distance'},inplace=True)

t6 = t2.groupby('user_id').agg('median').reset_index()
t6.rename(columns={'distance':'user_median_distance'},inplace=True)

# purchases made with a coupon, per user
t7 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id']]
t7['buy_use_coupon'] = 1
t7 = t7.groupby('user_id').agg('sum').reset_index()

# total purchases per user
t8 = user3[user3.date!='null'][['user_id']]
t8['buy_total'] = 1
t8 = t8.groupby('user_id').agg('sum').reset_index()

# coupons received per user
t9 = user3[user3.coupon_id!='null'][['user_id']]
t9['coupon_received'] = 1
t9 = t9.groupby('user_id').agg('sum').reset_index()

# per redeemed coupon: days between consumption and receipt
t10 = user3[(user3.date_received!='null')&(user3.date!='null')][['user_id','date_received','date']]
t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
     638 t10 = t10[['user_id','user_date_datereceived_gap']]
     639 
     640 t11 = t10.groupby('user_id').agg('mean').reset_index()
     641 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
     642 t12 = t10.groupby('user_id').agg('min').reset_index()
     643 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
     644 t13 = t10.groupby('user_id').agg('max').reset_index()
     645 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)
     646 
     647 
     648 user3_feature = pd.merge(t,t1,on='user_id',how='left')
     649 user3_feature = pd.merge(user3_feature,t3,on='user_id',how='left')
     650 user3_feature = pd.merge(user3_feature,t4,on='user_id',how='left')
     651 user3_feature = pd.merge(user3_feature,t5,on='user_id',how='left')
     652 user3_feature = pd.merge(user3_feature,t6,on='user_id',how='left')
     653 user3_feature = pd.merge(user3_feature,t7,on='user_id',how='left')
     654 user3_feature = pd.merge(user3_feature,t8,on='user_id',how='left')
     655 user3_feature = pd.merge(user3_feature,t9,on='user_id',how='left')
     656 user3_feature = pd.merge(user3_feature,t11,on='user_id',how='left')
     657 user3_feature = pd.merge(user3_feature,t12,on='user_id',how='left')
     658 user3_feature = pd.merge(user3_feature,t13,on='user_id',how='left')
     659 user3_feature.count_merchant = user3_feature.count_merchant.replace(np.nan,0)
     660 user3_feature.buy_use_coupon = user3_feature.buy_use_coupon.replace(np.nan,0)
     661 user3_feature['buy_use_coupon_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.buy_total.astype('float')
     662 user3_feature['user_coupon_transfer_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.coupon_received.astype('float')
     663 user3_feature.buy_total = user3_feature.buy_total.replace(np.nan,0)
     664 user3_feature.coupon_received = user3_feature.coupon_received.replace(np.nan,0)
     665 user3_feature.to_csv('data/user3_feature.csv',index=None)
     666 
     667 
     668 #for dataset2
     669 user2 = feature2[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]
     670 
     671 t = user2[['user_id']]
     672 t.drop_duplicates(inplace=True)
     673 
     674 t1 = user2[user2.date!='null'][['user_id','merchant_id']]
     675 t1.drop_duplicates(inplace=True)
     676 t1.merchant_id = 1
     677 t1 = t1.groupby('user_id').agg('sum').reset_index()
     678 t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
     679 user2['coupon_id']=user2['coupon_id'].replace(np.nan,'null')
     680 t2 = user2[(user2.date!='null')&(user2.coupon_id!='null')][['user_id','distance']]
     681 t2.replace('null',-1,inplace=True)
     682 #t2.distance = t2.distance.astype('int')
     683 t2.replace(-1,np.nan,inplace=True)
     684 t3 = t2.groupby('user_id').agg('min').reset_index()
     685 t3.rename(columns={'distance':'user_min_distance'},inplace=True)
     686 
     687 t4 = t2.groupby('user_id').agg('max').reset_index()
     688 t4.rename(columns={'distance':'user_max_distance'},inplace=True)
     689 
     690 t5 = t2.groupby('user_id').agg('mean').reset_index()
     691 t5.rename(columns={'distance':'user_mean_distance'},inplace=True)
     692 
     693 t6 = t2.groupby('user_id').agg('median').reset_index()
     694 t6.rename(columns={'distance':'user_median_distance'},inplace=True)
     695 
     696 t7 = user2[(user2.date!='null')&(user2.coupon_id!='null')][['user_id']]
     697 t7['buy_use_coupon'] = 1
     698 t7 = t7.groupby('user_id').agg('sum').reset_index()
     699 
     700 t8 = user2[user2.date!='null'][['user_id']]
     701 t8['buy_total'] = 1
     702 t8 = t8.groupby('user_id').agg('sum').reset_index()
     703 
     704 t9 = user2[user2.coupon_id!='null'][['user_id']]
     705 t9['coupon_received'] = 1
     706 t9 = t9.groupby('user_id').agg('sum').reset_index()
     707 
     708 t10 = user2[(user2.date_received!='null')&(user2.date!='null')][['user_id','date_received','date']]
     709 t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
     710 t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
     711 t10 = t10[['user_id','user_date_datereceived_gap']]
     712 
     713 t11 = t10.groupby('user_id').agg('mean').reset_index()
     714 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
     715 t12 = t10.groupby('user_id').agg('min').reset_index()
     716 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
     717 t13 = t10.groupby('user_id').agg('max').reset_index()
     718 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)
     719 
     720 user2_feature = pd.merge(t,t1,on='user_id',how='left')
     721 user2_feature = pd.merge(user2_feature,t3,on='user_id',how='left')
     722 user2_feature = pd.merge(user2_feature,t4,on='user_id',how='left')
     723 user2_feature = pd.merge(user2_feature,t5,on='user_id',how='left')
     724 user2_feature = pd.merge(user2_feature,t6,on='user_id',how='left')
     725 user2_feature = pd.merge(user2_feature,t7,on='user_id',how='left')
     726 user2_feature = pd.merge(user2_feature,t8,on='user_id',how='left')
     727 user2_feature = pd.merge(user2_feature,t9,on='user_id',how='left')
     728 user2_feature = pd.merge(user2_feature,t11,on='user_id',how='left')
     729 user2_feature = pd.merge(user2_feature,t12,on='user_id',how='left')
     730 user2_feature = pd.merge(user2_feature,t13,on='user_id',how='left')
     731 user2_feature.count_merchant = user2_feature.count_merchant.replace(np.nan,0)
     732 user2_feature.buy_use_coupon = user2_feature.buy_use_coupon.replace(np.nan,0)
     733 user2_feature['buy_use_coupon_rate'] = user2_feature.buy_use_coupon.astype('float') / user2_feature.buy_total.astype('float')
     734 user2_feature['user_coupon_transfer_rate'] = user2_feature.buy_use_coupon.astype('float') / user2_feature.coupon_received.astype('float')
     735 user2_feature.buy_total = user2_feature.buy_total.replace(np.nan,0)
     736 user2_feature.coupon_received = user2_feature.coupon_received.replace(np.nan,0)
     737 user2_feature.to_csv('data/user2_feature.csv',index=None)
     738 
     739 
     740 #for dataset1
     741 user1 = feature1[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]
     742 
     743 t = user1[['user_id']]
     744 t.drop_duplicates(inplace=True)
     745 
     746 t1 = user1[user1.date!='null'][['user_id','merchant_id']]
     747 t1.drop_duplicates(inplace=True)
     748 t1.merchant_id = 1
     749 t1 = t1.groupby('user_id').agg('sum').reset_index()
     750 t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
     751 user1['coupon_id']=user1['coupon_id'].replace(np.nan,'null')
     752 t2 = user1[(user1.date!='null')&(user1.coupon_id!='null')][['user_id','distance']]
     753 t2.replace('null',-1,inplace=True)
     754 #t2.distance = t2.distance.astype('int')
     755 t2.replace(-1,np.nan,inplace=True)
     756 t3 = t2.groupby('user_id').agg('min').reset_index()
     757 t3.rename(columns={'distance':'user_min_distance'},inplace=True)
     758 
     759 t4 = t2.groupby('user_id').agg('max').reset_index()
     760 t4.rename(columns={'distance':'user_max_distance'},inplace=True)
     761 
     762 t5 = t2.groupby('user_id').agg('mean').reset_index()
     763 t5.rename(columns={'distance':'user_mean_distance'},inplace=True)
     764 
     765 t6 = t2.groupby('user_id').agg('median').reset_index()
     766 t6.rename(columns={'distance':'user_median_distance'},inplace=True)
     767 
     768 t7 = user1[(user1.date!='null')&(user1.coupon_id!='null')][['user_id']]
     769 t7['buy_use_coupon'] = 1
     770 t7 = t7.groupby('user_id').agg('sum').reset_index()
     771 
     772 t8 = user1[user1.date!='null'][['user_id']]
     773 t8['buy_total'] = 1
     774 t8 = t8.groupby('user_id').agg('sum').reset_index()
     775 
     776 t9 = user1[user1.coupon_id!='null'][['user_id']]
     777 t9['coupon_received'] = 1
     778 t9 = t9.groupby('user_id').agg('sum').reset_index()
     779 
     780 t10 = user1[(user1.date_received!='null')&(user1.date!='null')][['user_id','date_received','date']]
     781 t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
     782 t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
     783 t10 = t10[['user_id','user_date_datereceived_gap']]
     784 
     785 t11 = t10.groupby('user_id').agg('mean').reset_index()
     786 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
     787 t12 = t10.groupby('user_id').agg('min').reset_index()
     788 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
     789 t13 = t10.groupby('user_id').agg('max').reset_index()
     790 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)
     791 
     792 user1_feature = pd.merge(t,t1,on='user_id',how='left')
     793 user1_feature = pd.merge(user1_feature,t3,on='user_id',how='left')
     794 user1_feature = pd.merge(user1_feature,t4,on='user_id',how='left')
     795 user1_feature = pd.merge(user1_feature,t5,on='user_id',how='left')
     796 user1_feature = pd.merge(user1_feature,t6,on='user_id',how='left')
     797 user1_feature = pd.merge(user1_feature,t7,on='user_id',how='left')
     798 user1_feature = pd.merge(user1_feature,t8,on='user_id',how='left')
     799 user1_feature = pd.merge(user1_feature,t9,on='user_id',how='left')
     800 user1_feature = pd.merge(user1_feature,t11,on='user_id',how='left')
     801 user1_feature = pd.merge(user1_feature,t12,on='user_id',how='left')
     802 user1_feature = pd.merge(user1_feature,t13,on='user_id',how='left')
     803 user1_feature.count_merchant = user1_feature.count_merchant.replace(np.nan,0)
     804 user1_feature.buy_use_coupon = user1_feature.buy_use_coupon.replace(np.nan,0)
     805 user1_feature['buy_use_coupon_rate'] = user1_feature.buy_use_coupon.astype('float') / user1_feature.buy_total.astype('float')
     806 user1_feature['user_coupon_transfer_rate'] = user1_feature.buy_use_coupon.astype('float') / user1_feature.coupon_received.astype('float')
     807 user1_feature.buy_total = user1_feature.buy_total.replace(np.nan,0)
     808 user1_feature.coupon_received = user1_feature.coupon_received.replace(np.nan,0)
     809 user1_feature.to_csv('data/user1_feature.csv',index=None)
     810 
     811 
     812 
     813 ##################  user_merchant related feature #########################
     814 
     815 """
     816 4.user_merchant:
     817       times_user_buy_merchant_before.
     818 """
     819 #for dataset3
     820 all_user_merchant = feature3[['user_id','merchant_id']]
     821 all_user_merchant.drop_duplicates(inplace=True)
     822 
     823 t = feature3[['user_id','merchant_id','date']]
     824 t = t[t.date!='null'][['user_id','merchant_id']]
     825 t['user_merchant_buy_total'] = 1
     826 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     827 t.drop_duplicates(inplace=True)
     828 
     829 t1 = feature3[['user_id','merchant_id','coupon_id']]
     830 t1.coupon_id=t1.coupon_id.replace(np.nan,'null')
     831 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
     832 t1['user_merchant_received'] = 1
     833 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     834 t1.drop_duplicates(inplace=True)
     835 
     836 t2 = feature3[['user_id','merchant_id','date','date_received']]
     837 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
     838 t2['user_merchant_buy_use_coupon'] = 1
     839 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     840 t2.drop_duplicates(inplace=True)
     841 
     842 t3 = feature3[['user_id','merchant_id']]
     843 t3['user_merchant_any'] = 1
     844 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     845 t3.drop_duplicates(inplace=True)
     846 
     847 t4 = feature3[['user_id','merchant_id','date','coupon_id']]
     848 t4.coupon_id=t4.coupon_id.replace(np.nan,'null')
     849 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
     850 t4['user_merchant_buy_common'] = 1
     851 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     852 t4.drop_duplicates(inplace=True)
     853 
     854 user_merchant3 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
     855 user_merchant3 = pd.merge(user_merchant3,t1,on=['user_id','merchant_id'],how='left')
     856 user_merchant3 = pd.merge(user_merchant3,t2,on=['user_id','merchant_id'],how='left')
     857 user_merchant3 = pd.merge(user_merchant3,t3,on=['user_id','merchant_id'],how='left')
     858 user_merchant3 = pd.merge(user_merchant3,t4,on=['user_id','merchant_id'],how='left')
     859 user_merchant3.user_merchant_buy_use_coupon = user_merchant3.user_merchant_buy_use_coupon.replace(np.nan,0)
     860 user_merchant3.user_merchant_buy_common = user_merchant3.user_merchant_buy_common.replace(np.nan,0)
     861 user_merchant3['user_merchant_coupon_transfer_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_received.astype('float')
     862 user_merchant3['user_merchant_coupon_buy_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_buy_total.astype('float')
     863 user_merchant3['user_merchant_rate'] = user_merchant3.user_merchant_buy_total.astype('float') / user_merchant3.user_merchant_any.astype('float')
     864 user_merchant3['user_merchant_common_buy_rate'] = user_merchant3.user_merchant_buy_common.astype('float') / user_merchant3.user_merchant_buy_total.astype('float')
     865 user_merchant3.to_csv('data/user_merchant3.csv',index=None)
     866 
     867 #for dataset2
     868 all_user_merchant = feature2[['user_id','merchant_id']]
     869 all_user_merchant.drop_duplicates(inplace=True)
     870 
     871 t = feature2[['user_id','merchant_id','date']]
     872 t = t[t.date!='null'][['user_id','merchant_id']]
     873 t['user_merchant_buy_total'] = 1
     874 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     875 t.drop_duplicates(inplace=True)
     876 
     877 t1 = feature2[['user_id','merchant_id','coupon_id']]
     878 t1.coupon_id=t1.coupon_id.replace(np.nan,'null')
     879 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
     880 t1['user_merchant_received'] = 1
     881 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     882 t1.drop_duplicates(inplace=True)
     883 
     884 t2 = feature2[['user_id','merchant_id','date','date_received']]
     885 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
     886 t2['user_merchant_buy_use_coupon'] = 1
     887 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     888 t2.drop_duplicates(inplace=True)
     889 
     890 t3 = feature2[['user_id','merchant_id']]
     891 t3['user_merchant_any'] = 1
     892 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     893 t3.drop_duplicates(inplace=True)
     894 
     895 t4 = feature2[['user_id','merchant_id','date','coupon_id']]
     896 t4.coupon_id=t4.coupon_id.replace(np.nan,'null')
     897 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
     898 t4['user_merchant_buy_common'] = 1
     899 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     900 t4.drop_duplicates(inplace=True)
     901 
     902 user_merchant2 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
     903 user_merchant2 = pd.merge(user_merchant2,t1,on=['user_id','merchant_id'],how='left')
     904 user_merchant2 = pd.merge(user_merchant2,t2,on=['user_id','merchant_id'],how='left')
     905 user_merchant2 = pd.merge(user_merchant2,t3,on=['user_id','merchant_id'],how='left')
     906 user_merchant2 = pd.merge(user_merchant2,t4,on=['user_id','merchant_id'],how='left')
     907 user_merchant2.user_merchant_buy_use_coupon = user_merchant2.user_merchant_buy_use_coupon.replace(np.nan,0)
     908 user_merchant2.user_merchant_buy_common = user_merchant2.user_merchant_buy_common.replace(np.nan,0)
     909 user_merchant2['user_merchant_coupon_transfer_rate'] = user_merchant2.user_merchant_buy_use_coupon.astype('float') / user_merchant2.user_merchant_received.astype('float')
     910 user_merchant2['user_merchant_coupon_buy_rate'] = user_merchant2.user_merchant_buy_use_coupon.astype('float') / user_merchant2.user_merchant_buy_total.astype('float')
     911 user_merchant2['user_merchant_rate'] = user_merchant2.user_merchant_buy_total.astype('float') / user_merchant2.user_merchant_any.astype('float')
     912 user_merchant2['user_merchant_common_buy_rate'] = user_merchant2.user_merchant_buy_common.astype('float') / user_merchant2.user_merchant_buy_total.astype('float')
     913 user_merchant2.to_csv('data/user_merchant2.csv',index=None)
     914 
     915 #for dataset1
     916 all_user_merchant = feature1[['user_id','merchant_id']]
     917 all_user_merchant.drop_duplicates(inplace=True)
     918 
     919 t = feature1[['user_id','merchant_id','date']]
     920 t = t[t.date!='null'][['user_id','merchant_id']]
     921 t['user_merchant_buy_total'] = 1
     922 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     923 t.drop_duplicates(inplace=True)
     924 
     925 t1 = feature1[['user_id','merchant_id','coupon_id']]
     926 t1.coupon_id=t1.coupon_id.replace(np.nan,'null')
     927 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
     928 t1['user_merchant_received'] = 1
     929 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     930 t1.drop_duplicates(inplace=True)
     931 
     932 t2 = feature1[['user_id','merchant_id','date','date_received']]
     933 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
     934 t2['user_merchant_buy_use_coupon'] = 1
     935 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     936 t2.drop_duplicates(inplace=True)
     937 
     938 t3 = feature1[['user_id','merchant_id']]
     939 t3['user_merchant_any'] = 1
     940 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     941 t3.drop_duplicates(inplace=True)
     942 
     943 t4 = feature1[['user_id','merchant_id','date','coupon_id']]
     944 t4.coupon_id=t4.coupon_id.replace(np.nan,'null')
     945 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
     946 t4['user_merchant_buy_common'] = 1
     947 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
     948 t4.drop_duplicates(inplace=True)
     949 
     950 user_merchant1 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
     951 user_merchant1 = pd.merge(user_merchant1,t1,on=['user_id','merchant_id'],how='left')
     952 user_merchant1 = pd.merge(user_merchant1,t2,on=['user_id','merchant_id'],how='left')
     953 user_merchant1 = pd.merge(user_merchant1,t3,on=['user_id','merchant_id'],how='left')
     954 user_merchant1 = pd.merge(user_merchant1,t4,on=['user_id','merchant_id'],how='left')
     955 user_merchant1.user_merchant_buy_use_coupon = user_merchant1.user_merchant_buy_use_coupon.replace(np.nan,0)
     956 user_merchant1.user_merchant_buy_common = user_merchant1.user_merchant_buy_common.replace(np.nan,0)
     957 user_merchant1['user_merchant_coupon_transfer_rate'] = user_merchant1.user_merchant_buy_use_coupon.astype('float') / user_merchant1.user_merchant_received.astype('float')
     958 user_merchant1['user_merchant_coupon_buy_rate'] = user_merchant1.user_merchant_buy_use_coupon.astype('float') / user_merchant1.user_merchant_buy_total.astype('float')
     959 user_merchant1['user_merchant_rate'] = user_merchant1.user_merchant_buy_total.astype('float') / user_merchant1.user_merchant_any.astype('float')
     960 user_merchant1['user_merchant_common_buy_rate'] = user_merchant1.user_merchant_buy_common.astype('float') / user_merchant1.user_merchant_buy_total.astype('float')
     961 user_merchant1.to_csv('data/user_merchant1.csv',index=None)
     962 
     963 ##################  generate training and testing set ################
     964 def get_label(s):
     965     s = s.split(':')
     966     if s[0]=='nan':
     967         return 0
     968     elif (date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8]))-date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days<=15:
     969         return 1
     970     else:
     971         return -1
     972 
     973 
     974 coupon3 = pd.read_csv('data/coupon3_feature.csv')
     975 merchant3 = pd.read_csv('data/merchant3_feature.csv')
     976 user3 = pd.read_csv('data/user3_feature.csv')
     977 user_merchant3 = pd.read_csv('data/user_merchant3.csv')
     978 other_feature3 = pd.read_csv('data/other_feature3.csv')
     979 dataset3 = pd.merge(coupon3,merchant3,on='merchant_id',how='left')
     980 dataset3 = pd.merge(dataset3,user3,on='user_id',how='left')
     981 dataset3 = pd.merge(dataset3,user_merchant3,on=['user_id','merchant_id'],how='left')
     982 dataset3 = pd.merge(dataset3,other_feature3,on=['user_id','coupon_id','date_received'],how='left')
     983 dataset3.drop_duplicates(inplace=True)
     984 print (dataset3.shape)
     985 
     986 dataset3.user_merchant_buy_total = dataset3.user_merchant_buy_total.replace(np.nan,0)
     987 dataset3.user_merchant_any = dataset3.user_merchant_any.replace(np.nan,0)
     988 dataset3.user_merchant_received = dataset3.user_merchant_received.replace(np.nan,0)
     989 dataset3['is_weekend'] = dataset3.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
     990 weekday_dummies = pd.get_dummies(dataset3.day_of_week)
     991 weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
     992 dataset3 = pd.concat([dataset3,weekday_dummies],axis=1)
     993 dataset3.drop(['merchant_id','day_of_week','coupon_count'],axis=1,inplace=True)
     994 dataset3 = dataset3.replace('null',np.nan)
     995 dataset3.to_csv('data/dataset3.csv',index=None)
     996 
     997 
     998 coupon2 = pd.read_csv('data/coupon2_feature.csv')
     999 merchant2 = pd.read_csv('data/merchant2_feature.csv')
    1000 user2 = pd.read_csv('data/user2_feature.csv')
    1001 user_merchant2 = pd.read_csv('data/user_merchant2.csv')
    1002 other_feature2 = pd.read_csv('data/other_feature2.csv')
    1003 dataset2 = pd.merge(coupon2,merchant2,on='merchant_id',how='left')
    1004 dataset2 = pd.merge(dataset2,user2,on='user_id',how='left')
    1005 dataset2 = pd.merge(dataset2,user_merchant2,on=['user_id','merchant_id'],how='left')
    1006 dataset2 = pd.merge(dataset2,other_feature2,on=['user_id','coupon_id','date_received'],how='left')
    1007 dataset2.drop_duplicates(inplace=True)
    1008 print( dataset2.shape)
    1009 
    1010 dataset2.user_merchant_buy_total = dataset2.user_merchant_buy_total.replace(np.nan,0)
    1011 dataset2.user_merchant_any = dataset2.user_merchant_any.replace(np.nan,0)
    1012 dataset2.user_merchant_received = dataset2.user_merchant_received.replace(np.nan,0)
    1013 dataset2['is_weekend'] = dataset2.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
    1014 weekday_dummies = pd.get_dummies(dataset2.day_of_week)
    1015 weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
    1016 dataset2 = pd.concat([dataset2,weekday_dummies],axis=1)
    1017 dataset2['label'] = dataset2.date.astype('str') + ':' +  dataset2.date_received.astype('str')
    1018 dataset2.label = dataset2.label.apply(get_label)
    1019 dataset2.drop(['merchant_id','day_of_week','date','date_received','coupon_id','coupon_count'],axis=1,inplace=True)
    1020 dataset2 = dataset2.replace('null',np.nan)
    1021 dataset2.to_csv('data/dataset2.csv',index=None)
    1022 
    1023 
    1024 coupon1 = pd.read_csv('data/coupon1_feature.csv')
    1025 merchant1 = pd.read_csv('data/merchant1_feature.csv')
    1026 user1 = pd.read_csv('data/user1_feature.csv')
    1027 user_merchant1 = pd.read_csv('data/user_merchant1.csv')
    1028 other_feature1 = pd.read_csv('data/other_feature1.csv')
    1029 dataset1 = pd.merge(coupon1,merchant1,on='merchant_id',how='left')
    1030 dataset1 = pd.merge(dataset1,user1,on='user_id',how='left')
    1031 dataset1 = pd.merge(dataset1,user_merchant1,on=['user_id','merchant_id'],how='left')
    1032 dataset1 = pd.merge(dataset1,other_feature1,on=['user_id','coupon_id','date_received'],how='left')
    1033 dataset1.drop_duplicates(inplace=True)
    1034 print (dataset1.shape)
    1035 
    1036 dataset1.user_merchant_buy_total = dataset1.user_merchant_buy_total.replace(np.nan,0)
    1037 dataset1.user_merchant_any = dataset1.user_merchant_any.replace(np.nan,0)
    1038 dataset1.user_merchant_received = dataset1.user_merchant_received.replace(np.nan,0)
    1039 dataset1['is_weekend'] = dataset1.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
    1040 weekday_dummies = pd.get_dummies(dataset1.day_of_week)
    1041 weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
    1042 dataset1 = pd.concat([dataset1,weekday_dummies],axis=1)
    1043 dataset1['label'] = dataset1.date.astype('str') + ':' +  dataset1.date_received.astype('str')
    1044 dataset1.label = dataset1.label.apply(get_label)
    1045 dataset1.drop(['merchant_id','day_of_week','date','date_received','coupon_id','coupon_count'],axis=1,inplace=True)
    1046 dataset1 = dataset1.replace('null',np.nan)
    1047 dataset1.to_csv('data/dataset1.csv',index=None)
    View Code

    GBDT训练部分:

      1 # -*- coding: utf-8 -*-
      2 """
      3 Created on Mon Dec  9 13:21:52 2019
      4 
      5 @author: wenzhe.tian
      6 
      7 
      8 GBDT调参顺序 可使用GridSearchCV
      9 1. n_estimators & learning_rate
     10 2. max_depth 或可为1
     11 3. min_samples_split & min_samples_leaf
     12 4. max_features 特征较多时考虑取部分
     13 5. subsample 使用样本的比例
     14 """
     15 from sklearn.preprocessing import MinMaxScaler
     16 import pandas as pd
     17 import numpy as np
     18 from sklearn import ensemble
     19 from sklearn.ensemble import GradientBoostingRegressor
      20 from sklearn.model_selection import GridSearchCV
     21 import pickle
     22 import os
     23 import datetime
     24 dataset1 = pd.read_csv('data/dataset1.csv')
     25 dataset1.label.replace(-1,0,inplace=True)
     26 dataset2 = pd.read_csv('data/dataset2.csv')
     27 dataset2.label.replace(-1,0,inplace=True)
     28 dataset3 = pd.read_csv('data/dataset3.csv')
     29 
     30 dataset1.drop_duplicates(inplace=True)
     31 dataset2.drop_duplicates(inplace=True)
     32 dataset3.drop_duplicates(inplace=True)
     33 
     34 dataset12 = pd.concat([dataset1,dataset2],axis=0)
     35 
     36 dataset1_y = dataset1.label
     37 dataset1_x = dataset1.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1)  # 'day_gap_before','day_gap_after' cause overfitting, 0.77
     38 dataset2_y = dataset2.label
     39 dataset2_x = dataset2.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1)
     40 dataset12_y = dataset12.label
     41 dataset12_x = dataset12.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1)
     42 dataset3_preds = dataset3[['user_id','coupon_id','date_received']]
     43 dataset3_x = dataset3.drop(['user_id','coupon_id','date_received','day_gap_before','day_gap_after'],axis=1)
     44 
     45 dataset12_x=dataset12_x.fillna(-100)
     46 dataset3_x=dataset3_x.fillna(-100)
     47 
     48 
     49 
     50 
     51 score=[]
     52 # 深度调参
     53 #gbdt=ensemble.GradientBoostingRegressor(
     54 #  loss='ls'
     55 #, learning_rate=0.1
     56 #, n_estimators=1000
     57 #, subsample=1
     58 #, min_samples_split=15
     59 #, min_samples_leaf=10
     60 #, max_depth=3
     61 #, init=None
     62 #, random_state=None
     63 #, max_features=None
     64 #, alpha=0.9
     65 #, verbose=1
     66 #, max_leaf_nodes=None
     67 #, warm_start=False)
     68 #gbdt.fit(dataset12_x,dataset12_y)
     69 #
     70 #score.append(gbdt.score(dataset12_x,dataset12_y))
     71 
     72 #dataset3_preds['label'] = gbdt.predict(dataset3_x)
     73 #dataset3_preds.label = MinMaxScaler().fit_transform(dataset3_preds.label.reshape(-1, 1))
     74 #dataset3_preds.sort_values(by=['coupon_id','label'],inplace=True)
     75 #dataset3_preds.to_csv("test.csv",index=None,header=None)
     76 
     77 # 深度调参
     78 gbdt=ensemble.GradientBoostingRegressor(
     79   loss='ls'
     80 , learning_rate=0.11
     81 , n_estimators=160
     82 , subsample=1
     83 , min_samples_split=4  #4 is best
     84 , min_samples_leaf=1
     85 , max_depth=4
     86 , init=None
     87 , random_state=None
     88 , max_features=None
     89 , alpha=0.9
     90 , verbose=1
     91 , max_leaf_nodes=None
     92 , warm_start=False)
     93 gbdt.fit(dataset12_x,dataset12_y)
     94 
     95 score.append(gbdt.score(dataset12_x,dataset12_y))
     96 
     97 
     98 dataset3_preds['label'] = gbdt.predict(dataset3_x)
      99 dataset3_preds.label = MinMaxScaler().fit_transform(dataset3_preds.label.values.reshape(-1, 1))
    100 dataset3_preds.sort_values(by=['coupon_id','label'],inplace=True)
    101 dataset3_preds.to_csv("results"+datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S')+".csv",index=None,header=None)
    102 
    103 #os.system('shutdown -s -t 10')
    View Code

    GridSearchCV的一个调参案例:

    param_test2 = {'n_estimators':[50,75,100,125,150,200,300,400]}
    # 迭代次数调参
    gsearch2 = GridSearchCV(
        estimator=GradientBoostingRegressor(
          loss='ls'
        , learning_rate=0.1
    #    , n_estimators=125
        , subsample=1
        , min_samples_split=5 #2
        , min_samples_leaf=1   #1
        , max_depth=5
        , init=None
        , random_state=None
        , max_features=None
        , alpha=0.9
        , verbose=1
        , max_leaf_nodes=None
        , warm_start=False
        ),param_grid = param_test2, scoring='roc_auc',cv=None)  # NOTE: the 'iid' parameter was removed in scikit-learn 0.24
    
    gsearch2.fit(dataset12_x,dataset12_y)
    ans_vid1={}
    ans_vid1['grid_scores']=gsearch2.cv_results_  # grid_scores_ was replaced by cv_results_ in sklearn.model_selection
    ans_vid1['best_params']=gsearch2.best_params_
    ans_vid1['best_score']=gsearch2.best_score_
  • 相关阅读:
    python练习1--求和
    python--文件操作
    【省选】SDOI2017_树点涂色_LuoguP3703/LOJ2001_LCT/线段树
    【洛谷】P1784 数独
    【洛谷】P2671 求和
    【洛谷】P2261 [CQOI2007]余数求和
    【转载】利用向量积(叉积)计算三角形的面积和多边形的面积
    【洛谷】P1090 合并果子
    【转载】学习C++ -> 类(Classes)的定义与实现
    【洛谷】P2142 高精度减法
  • 原文地址:https://www.cnblogs.com/techs-wenzhe/p/12901297.html
Copyright © 2020-2023  润新知