• O2O淘宝优惠券代码总结


    一、数据集预处理

      1、数据读入

    import pandas as pd
    import numpy as np
    import datetime as date
    import datetime as dt
    
    #先导入数据
    
    off_train = pd.read_csv("data/ccf_offline_stage1_train.csv",header = 0)
    off_train.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']
    # read_csv 读入数据(header = 0)不读入表头,第二句设置表头
    off_test = pd.read_csv("data/ccf_offline_stage1_test_revised.csv",header = 0)
    off_test.columns  = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received']
    
    on_train = pd.read_csv("data/ccf_online_stage1_train.csv",header=0)
    on_train.columns = ['user_id','merchant_id','action','coupon_id','discount_rate','date_received','date']
    

      

      2、数据划分

    # 按照时间划分训练集和测试集
    # 滑窗法划分
    """
    将2016年1月1日到4月13日的数据提取特征,利用4月14日的到5月14日的作为测试集
    将2月1日到5月14日的作为数据集提取特征,利用5月15日6月15日的作为测试集
    将3月15日到6月30日作为数据集提取特征,再测试7月1日到7月31日的数据
    dataset用来做测试集,feature用来做训练集
    """
    #数据集3的特征为 取 线上数据中领券和用券日期大于3月15日和小于6月30日的
    #将3月15日到6月30日作为数据集提取特征,再测试7月1日到7月31日的数据
    #使数据集3等于test集
    """"
    dataset里面只有接收优惠券的记录的,无消费记录,可用于预测
    feature里面存的是优惠券使用日期或接收优惠券的时间介于3月15日到6月30之间的记录
    """
    dataset3 = off_test
    feature3 = off_train[((off_train.date>='20160315')&(off_train.date<='20160630')|((off_train.date=='null')&(off_train.date_received>='20160315')&(off_train.date_received<='20160630')))]
    
    #提取数据集2的测试集
    #将2月1日到5月14日的作为数据集提取特征,利用5月15日6月15日的作为测试集
    """"
    dataset里面只存放优惠券接收日期介于5月15日到6月15之间的记录
    feature里面存的是优惠券使用日期或接收优惠券的时间介于2月1日到5月14之间的记录
    """
    dataset2 = off_train[((off_train.date_received>='20160515')&(off_train.date_received<='20160615'))]
    
    feature2 = off_train[(off_train.date>='20160201')&(off_train.date<='20160514')|((off_train.date=='null')&(off_train.date_received>='20160201')&(off_train.date_received<='20160514'))]
    """"
    dataset里面只存放优惠券接收日期介于4月14日到5月14之间的记录
    feature里面存的是优惠券使用日期或接收优惠券的时间介于1月1日到4月13之间的记录
    """
    dataset1 = off_train[(off_train.date_received>='201604014')&(off_train.date_received<='20160514')]
    feature1 = off_train[(off_train.date>='20160101')&(off_train.date<='20160413')|((off_train.date=='null')&(off_train.date_received>='20160101')&(off_train.date_received<='20160413'))]
    

    二、特征工程

      1、提取其他特征

    """
    # 提取特征:
        用户领取的所有优惠券数目
        ◦用户领取的特定优惠券数目
        ◦用户此次之后/前领取的所有优惠券数目
        ◦用户此次之后/前领取的特定优惠券数目
        ◦用户上/下一次领取的时间间隔
        ◦用户领取特定商家的优惠券数目
        ◦用户领取的不同商家数目
        ◦用户当天领取的优惠券数目
        ◦用户当天领取的特定优惠券数目
        ◦用户领取的所有优惠券种类数目
        ◦商家被领取的优惠券数目
        ◦商家被领取的特定优惠券数目
        ◦商家被多少不同用户领取的数目
        ◦商家发行的所有优惠券种类数目
    """
    # 对dataset3进行操作
    # 用户收到的优惠券总和
    t = dataset3[['user_id']]
    t['this_month_user_received_all_coupon_count'] = 1
     #将t按照用户id进行分组,然后统计所有用户收取的优惠券数目,并初始化一个索引值
    t = t.groupby('user_id').agg('sum').reset_index()
    # 用户收到特定优惠券的总和
    t1 = dataset3[['user_id','coupon_id']]
    t1['this_month_user_receive_same_coupon_count'] = 1
    t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()
    
    # 用户此次之前或之后领使用优惠券的时间
    # lambda x:':'.join(x)  是添加冒号并在后面去加字符
    # 将接收时间的一组按着':'分开,这样就可以计算接受了优惠券的数量,apply是合并
    # 最大接受的日期max_date_received/min_date_received
    t2 = dataset3[['user_id','coupon_id','date_received']]
    t2.date_received = t2.date_received.astype('str')
    t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
    t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
    t2 = t2[t2.receive_number>1]
    t2['max_date_received'] = t2.date_received.apply(lambda s:max([int (d) for d in s.split(':')]))
    t2['min_date_received'] = t2.date_received.apply(lambda s:min([int (d) for d in s.split(':')]))
    t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]
    
    # 将表格中接收优惠券日期中为最近和最远的日期时置为1其余为0,若只接受了一次优惠券为-1
    # 将两表融合只保留左表数据,这样得到的表,相当于保留了最近接收时间和最远接受时间
    t3 = dataset3[['user_id','coupon_id','date_received']]
    t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')
    t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received.astype(int)
    
    t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype(int) - t3.min_date_received
    
    def isfirstlastone(x):
        
        if x == 0:
            return 1
        elif x > 0:
            return 0
        else:
            return -1
    # 只接受过一次优惠券为者为 -1
    
    
    t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(isfirstlastone)
    t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(isfirstlastone)
    
    # 第四个特征,一个用户所接收到的所有优惠券的数量
    t4 = dataset3[['user_id','date_received']]
    t4['this_day_user_receive_all_coupon_count'] = 1
    t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()
    
    # 提取第五个特征,一个用户不同时间所接收到不同优惠券的数量
    t5 = dataset3[['user_id','coupon_id','date_received']]
    t5['this_day_user_receive_same_coupon_count'] = 1
    t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()
    
    # 一个用户不同优惠券 的接受时间
    t6 = dataset3[['user_id','coupon_id','date_received']]
    t6.date_received = t6.date_received.astype('str')
    t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
    t6.rename(columns ={'date_received':'dates'},inplace = True)
    
    # 接收优惠券最近的日子天数
    def get_day_gap_before(s):
        date_received,dates = s.split('-')
        dates = dates.split(':')
        gaps = []
        for d in dates:
    #         print(date_received.type())
            this_gap = (dt.date(int(date_received[1:4]),(int(date_received[4:6])),(int(date_received[6:8]))) - dt.date((int(d[1:4])),(int(d[4:6])),(int(d[6:8])))).days
            if this_gap>0:
                gaps.append(this_gap)
        if len(gaps) == 0:
            return -1
        else:
            return min(gaps)
    # 接收优惠券最远的日子天数
    def get_day_gap_after(s):
        date_received,dates = s.split('-')
        dates = dates.split(':')
        gaps = []
        for d in dates:
            this_gap = (dt.datetime(int(d[0:4]),int(d[4:6]),int(d[6:8])) - dt.datetime(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
            if this_gap>0:
                gaps.append(this_gap)
        if len(gaps) == 0:
            return -1
        else:
            return min(gaps)
    
    t7 = dataset3[['user_id','coupon_id','date_received']]
    t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
    t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates.astype('str')
    
    t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
    t7['day_gap_after']  = t7.date_received_date.apply(get_day_gap_after)
    t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]
    # feature3 提取的特征存入CSV中
    other_feature3 = pd.merge(t1,t,on='user_id')
    other_feature3 = pd.merge(other_feature3,t3,on=['user_id','coupon_id'])
    other_feature3 = pd.merge(other_feature3,t4,on=['user_id','date_received'])
    other_feature3 = pd.merge(other_feature3,t5,on=['user_id','coupon_id','date_received'])
    other_feature3 = pd.merge(other_feature3,t7,on=['user_id','coupon_id','date_received'])
    other_feature3.to_csv('feature/other_feature3.csv',index=None)
    
    
    
    # 处理dataset2
    t = dataset2[['user_id']]
    t['this_month_user_received_all_coupon_count'] = 1
    t = t.groupby('user_id').agg('sum').reset_index()
    t1 = dataset2[['user_id','coupon_id']]
    t1['this_month_user_receive_same_coupon_count'] = 1
    t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()
    t2 = dataset3[['user_id','coupon_id','date_received']]
    t2.date_received = t2.date_received.astype('str')
    t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
    t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
    t2 = t2[t2.receive_number>1]
    t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(d) for d in s.split(':')]))
    t2['min_date_received'] = t2.date_received.apply(lambda s:max([int(d) for d in s.split(':')]))
    t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]
    
    t3 = dataset2[['user_id','coupon_id','date_received']]
    t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')
    t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received.astype('int')
    t3['this_month_user_receive_same_coupon_firstone']= t3.date_received.astype('int') - t3.min_date_received
    t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(isfirstlastone)
    t3.this_month_user_receive_same_coupon_firstone= t3.this_month_user_receive_same_coupon_firstone.apply(isfirstlastone)
    
    t4 = dataset2[['user_id','date_received']]
    t4['this_day_user_receive_all_coupon_count'] = 1
    t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()
    t5 = dataset2[['user_id','coupon_id','date_received']]
    t5['this_day_user_receive_same_coupon_count'] = 1
    t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()
    
    t6 = dataset2[['user_id','coupon_id','date_received']]
    t6.date_received = t6.date_received.astype('str')
    t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
    t6.rename(columns={'date_received':'dates'},inplace=True)
    
    t7 = dataset2[['user_id','coupon_id','date_received']]
    t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
    t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
    t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
    t7['day_gap_after']  = t7.date_received_date.apply(get_day_gap_before)
    t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]
    
    other_feature2 = pd.merge(t1,t,on='user_id')
    other_feature2 = pd.merge(other_feature2,t3,on=['user_id','coupon_id'])
    other_feature2 = pd.merge(other_feature2,t4,on=['user_id','date_received'])
    other_feature2 = pd.merge(other_feature2,t5,on=['user_id','coupon_id','date_received'])
    other_feature2 = pd.merge(other_feature2,t7,on=['user_id','coupon_id','date_received'])
    other_feature2.to_csv('feature/other_feature2.csv',index=None)
    
    # 处理dataset1
    t = dataset1[['user_id']]
    t['this_month_user_received_all_coupon_count'] = 1
    t = t.groupby('user_id').agg('sum').reset_index()
    t1 = dataset1[['user_id','coupon_id']]
    t1['this_month_user_receive_same_coupon_count'] = 1
    t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()
    t2 = dataset1[['user_id','coupon_id','date_received']]
    t2.date_received = t2.date_received.astype('str')
    t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
    t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
    t2 = t2[t2.receive_number>1]
    t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(d) for d in s.split(':')]))
    t2['min_date_received'] = t2.date_received.apply(lambda s:max([int(d) for d in s.split(':')]))
    t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]
    
    t3 = dataset1[['user_id','coupon_id','date_received']]
    t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')
    t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received.astype('int')
    t3['this_month_user_receive_same_coupon_firstone']= t3.date_received.astype('int') - t3.min_date_received
    t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(isfirstlastone)
    t3.this_month_user_receive_same_coupon_firstone= t3.this_month_user_receive_same_coupon_firstone.apply(isfirstlastone)
    
    t4 = dataset1[['user_id','date_received']]
    t4['this_day_user_receive_all_coupon_count'] = 1
    t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()
    t5 = dataset1[['user_id','coupon_id','date_received']]
    t5['this_day_user_receive_same_coupon_count'] = 1
    t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()
    
    t6 = dataset1[['user_id','coupon_id','date_received']]
    t6.date_received = t6.date_received.astype('str')
    t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
    t6.rename(columns={'date_received':'dates'},inplace=True)
    
    t7 = dataset1[['user_id','coupon_id','date_received']]
    t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
    t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
    t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
    t7['day_gap_after']  = t7.date_received_date.apply(get_day_gap_before)
    t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]
    
    other_feature1 = pd.merge(t1,t,on='user_id')
    other_feature1 = pd.merge(other_feature1,t3,on=['user_id','coupon_id'])
    other_feature1 = pd.merge(other_feature1,t4,on=['user_id','date_received'])
    other_feature1 = pd.merge(other_feature1,t5,on=['user_id','coupon_id','date_received'])
    other_feature1 = pd.merge(other_feature1,t7,on=['user_id','coupon_id','date_received'])
    other_feature1.to_csv('feature/other_feature1.csv',index=None)
    

      2、提取优惠券相关特征

    # 统一转化为打折卷
    def calc_discount_rate(s):
        s = str(s)
        s = s.split(':')
        if len(s) == 1:
            return float(s[0])
        else:
            return 1.0-float(s[1])/float(s[0])
    def get_discount_man(s):
        s = str(s)
        s = s.split(':')
        if len(s) == 1:
            return 'null'
        else:
            return int(s[0])
    def get_discount_jian(s):
        s = str(s)
        s = s.split(':')
        if len(s) == 1:
            return 'null'
        else:
            return int(s[1])
    def is_man_jian(s):
        s = str(s)
        s = s.split(':')
        if len(s) == 1:
            return 0
        else:
            return 1
    # 处理数据集3,处理时间属性,显示时间是第几周
    dataset3['day_of_week']  = dataset3.date_received.astype('str').apply(lambda x:(dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1))
    dataset3['day_of_month'] = dataset3.date_received.astype('str').apply(lambda x:int(x[6:8]))
    dataset3['days_distance']= dataset3.date_received.astype('str').apply(lambda x:(dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-dt.date(2016,6,30)).days)
    dataset3['discount_man'] = dataset3.discount_rate.apply(get_discount_man)
    dataset3['discount_jian']= dataset3.discount_rate.apply(get_discount_jian)
    dataset3['is_man_jian']  = dataset3.discount_rate.apply(is_man_jian)
    dataset3['discount_rate']= dataset3.discount_rate.apply(calc_discount_rate)
    d = dataset3[['coupon_id']]
    d['coupon_count'] = 1
    d = d.groupby('coupon_id').agg('sum').reset_index()
    dataset3 = pd.merge(dataset3,d,on='coupon_id',how='left')
    dataset3.to_csv('feature/coupon3_feature.csv',index=None)
    
    # 数据集2
    dataset2['day_of_week']  = dataset2.date_received.astype('str').apply(lambda x:dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
    dataset2['day_of_month'] = dataset2.date_received.astype('str').apply(lambda x:int(x[4:6]))
    dataset2['days_distance']= dataset2.date_received.astype('str').apply(lambda x:(dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-dt.date(2016,5,14)).days)
    dataset2['discount_man'] = dataset2.discount_rate.apply(get_discount_man)
    dataset2['discount_jian']= dataset2.discount_rate.apply(get_discount_jian)
    dataset2['is_man_jian']  = dataset2.discount_rate.apply(is_man_jian)
    dataset2['discount_rate']= dataset2.discount_rate.apply(calc_discount_rate)
    d = dataset2[['coupon_id']]
    d['coupon_count'] = 1
    d = d.groupby('coupon_id').agg('sum').reset_index()
    dataset2 = pd.merge(dataset2,d,on='coupon_id',how='left')
    dataset2.to_csv('feature/coupon2_feature.csv',index=None)
    
    # 数据集1
    dataset1['day_of_week']  = dataset1.date_received.astype('str').apply(lambda x:dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
    dataset1['day_of_month'] = dataset1.date_received.astype('str').apply(lambda x:int(x[4:6]))
    dataset1['days_distance']= dataset1.date_received.astype('str').apply(lambda x:(dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-dt.date(2016,4,16)).days)
    dataset1['discount_man'] = dataset1.discount_rate.apply(get_discount_man)
    dataset1['discount_jian']= dataset1.discount_rate.apply(get_discount_jian)
    dataset1['is_man_jian']  = dataset1.discount_rate.apply(is_man_jian)
    dataset1['discount_rate']= dataset1.discount_rate.apply(calc_discount_rate)
    d = dataset1[['coupon_id']]
    d['coupon_count'] = 1
    d = d.groupby('coupon_id').agg('sum').reset_index()
    dataset1 = pd.merge(dataset1,d,on='coupon_id',how='left')
    dataset1.to_csv('feature/coupon1_feature.csv',index=None)
    

      3、提取商户相关特征

    merchant3 = feature3[['merchant_id','coupon_id','distance','date_received','date']]
    t = merchant3[['merchant_id']]
    # 删除重复的行数据
    t.drop_duplicates(inplace=True)
    # 显示卖出的商品,以及卖出的数量
    # []用来强调条件或者新建一列并赋值   [[]]用来表示取哪个列来使用
    t1 = merchant3[merchant3.date!='null'][['merchant_id']]
    t1['total_sales'] = 1
    t1 = t1.groupby('merchant_id').agg('sum').reset_index()
    # 显示使用了优惠券消费的商品,正样本
    t2 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id']]
    t2['sales_use_coupon'] = 1
    t2 = t2.groupby('merchant_id').agg('sum').reset_index()
    # 提取商品优惠券的总数量
    t3 = merchant3[merchant3.coupon_id!='null'][['merchant_id']]
    t3['total_coupon'] = 1
    t3 = t3.groupby('merchant_id').agg('sum').reset_index()
    # 提取销量与距离的关系
    # 把数据中的空值全部替换为 -1
    t4 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id','distance']]
    t4.replace('null',-1,inplace=True)
    t4.distance = t4.distance.astype('int')
    t4.replace(-1,np.nan,inplace=True)
    # 提取用户和商店距离的最小值
    t5 = t4.groupby('merchant_id').agg('min').reset_index()
    t5.rename(columns={'distance':'merchant_min_distance'},inplace = True)
    # 提取用户和商店距离的最大值
    t6 = t4.groupby('merchant_id').agg('max').reset_index()
    t5.rename(columns={'distance':'merchant_max_distance'},inplace = True)
    # 提取用户和商品距离的平均值
    t7 = t4.groupby('merchant_id').agg('mean').reset_index()
    t7.rename(columns={'distance':'merchant_mean_distance'},inplace = True)
    # 提取用户与商品距离的中位数
    t8 = t4.groupby('merchant_id').agg('median').reset_index()
    # 把特征集合入一张表里
    merchant3_feature = pd.merge(t,t1,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t2,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t3,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t5,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t6,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t7,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t8,on='merchant_id',how='left')
    # merchant3_feature.head()
    # 替换数据中的NAN为0,便于计算优惠券的使用率以及其他信息
    # 优惠券的使用率、卖出的商品中使用优惠券的占比
    merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.replace(np.nan,0)
    merchant3_feature['merchant_coupon_transfer_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_sales
    merchant3_feature['coupon_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_sales
    merchant3_feature.total_coupon = merchant3_feature.total_coupon.replace(np.nan,0)
    merchant3_feature.to_csv('feature/merchant3_feature.csv',index=None)
    
    
    # 对feature2进行操作
    merchant2 = feature2[['merchant_id','coupon_id','distance','date_received','date']]
    t = merchant2[['merchant_id']]
    # 删除重复的行数据
    t.drop_duplicates(inplace=True)
    # 显示卖出的商品,以及卖出的数量
    # []用来强调条件或者新建一列并赋值   [[]]用来表示取哪个列来使用
    t1 = merchant2[merchant2.date!='null'][['merchant_id']]
    t1['total_sales'] = 1
    t1 = t1.groupby('merchant_id').agg('sum').reset_index()
    # 显示使用了优惠券消费的商品,正样本
    t2 = merchant2[(merchant2.date!='null')&(merchant2.coupon_id!='null')][['merchant_id']]
    t2['sales_use_coupon'] = 1
    t2 = t2.groupby('merchant_id').agg('sum').reset_index()
    # 提取商品优惠券的总数量
    t3 = merchant2[merchant2.coupon_id!='null'][['merchant_id']]
    t3['total_coupon'] = 1
    t3 = t3.groupby('merchant_id').agg('sum').reset_index()
    # 提取销量与距离的关系
    # 把数据中的空值全部替换为 -1
    t4 = merchant2[(merchant2.date!='null')&(merchant2.coupon_id!='null')][['merchant_id','distance']]
    t4.replace('null',-1,inplace=True)
    t4.distance = t4.distance.astype('int')
    t4.replace(-1,np.nan,inplace=True)
    # 提取用户和商店距离的最小值
    t5 = t4.groupby('merchant_id').agg('min').reset_index()
    t5.rename(columns={'distance':'merchant_min_distance'},inplace = True)
    # 提取用户和商店距离的最大值
    t6 = t4.groupby('merchant_id').agg('max').reset_index()
    t5.rename(columns={'distance':'merchant_max_distance'},inplace = True)
    # 提取用户和商品距离的平均值
    t7 = t4.groupby('merchant_id').agg('mean').reset_index()
    t7.rename(columns={'distance':'merchant_mean_distance'},inplace = True)
    # 提取用户与商品距离的中位数
    t8 = t4.groupby('merchant_id').agg('median').reset_index()
    # 把特征集合入一张表里
    merchant2_feature = pd.merge(t,t1,on='merchant_id',how='left')
    merchant2_feature = pd.merge(merchant2_feature,t2,on='merchant_id',how='left')
    merchant2_feature = pd.merge(merchant2_feature,t3,on='merchant_id',how='left')
    merchant2_feature = pd.merge(merchant2_feature,t5,on='merchant_id',how='left')
    merchant2_feature = pd.merge(merchant2_feature,t6,on='merchant_id',how='left')
    merchant2_feature = pd.merge(merchant2_feature,t7,on='merchant_id',how='left')
    merchant2_feature = pd.merge(merchant2_feature,t8,on='merchant_id',how='left')
    # merchant3_feature.head()
    # 替换数据中的NAN为0,便于计算优惠券的使用率以及其他信息
    # 优惠券的使用率、卖出的商品中使用优惠券的占比
    merchant2_feature.sales_use_coupon = merchant2_feature.sales_use_coupon.replace(np.nan,0)
    merchant2_feature['merchant_coupon_transfer_rate'] = merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_sales
    merchant2_feature['coupon_rate'] = merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_sales
    merchant2_feature.total_coupon = merchant2_feature.total_coupon.replace(np.nan,0)
    merchant2_feature.to_csv('feature/merchant2_feature.csv',index=None)
    
    
    
    # 对feature1进行操作
    merchant1 = feature1[['merchant_id','coupon_id','distance','date_received','date']]
    t = merchant1[['merchant_id']]
    # 删除重复的行数据
    t.drop_duplicates(inplace=True)
    # 显示卖出的商品,以及卖出的数量
    # []用来强调条件或者新建一列并赋值   [[]]用来表示取哪个列来使用
    t1 = merchant1[merchant1.date!='null'][['merchant_id']]
    t1['total_sales'] = 1
    t1 = t1.groupby('merchant_id').agg('sum').reset_index()
    # 显示使用了优惠券消费的商品,正样本
    t2 = merchant1[(merchant1.date!='null')&(merchant1.coupon_id!='null')][['merchant_id']]
    t2['sales_use_coupon'] = 1
    t2 = t2.groupby('merchant_id').agg('sum').reset_index()
    # 提取商品优惠券的总数量
    t3 = merchant1[merchant1.coupon_id!='null'][['merchant_id']]
    t3['total_coupon'] = 1
    t3 = t3.groupby('merchant_id').agg('sum').reset_index()
    # 提取销量与距离的关系
    # 把数据中的空值全部替换为 -1
    t4 = merchant1[(merchant1.date!='null')&(merchant1.coupon_id!='null')][['merchant_id','distance']]
    t4.replace('null',-1,inplace=True)
    t4.distance = t4.distance.astype('int')
    t4.replace(-1,np.nan,inplace=True)
    # 提取用户和商店距离的最小值
    t5 = t4.groupby('merchant_id').agg('min').reset_index()
    t5.rename(columns={'distance':'merchant_min_distance'},inplace = True)
    # 提取用户和商店距离的最大值
    t6 = t4.groupby('merchant_id').agg('max').reset_index()
    t5.rename(columns={'distance':'merchant_max_distance'},inplace = True)
    # 提取用户和商品距离的平均值
    t7 = t4.groupby('merchant_id').agg('mean').reset_index()
    t7.rename(columns={'distance':'merchant_mean_distance'},inplace = True)
    # 提取用户与商品距离的中位数
    t8 = t4.groupby('merchant_id').agg('median').reset_index()
    # 把特征集合入一张表里
    merchant1_feature = pd.merge(t,t1,on='merchant_id',how='left')
    merchant1_feature = pd.merge(merchant1_feature,t2,on='merchant_id',how='left')
    merchant1_feature = pd.merge(merchant1_feature,t3,on='merchant_id',how='left')
    merchant1_feature = pd.merge(merchant1_feature,t5,on='merchant_id',how='left')
    merchant1_feature = pd.merge(merchant1_feature,t6,on='merchant_id',how='left')
    merchant1_feature = pd.merge(merchant1_feature,t7,on='merchant_id',how='left')
    merchant1_feature = pd.merge(merchant1_feature,t8,on='merchant_id',how='left')
    # merchant3_feature.head()
    # 替换数据中的NAN为0,便于计算优惠券的使用率以及其他信息
    # 优惠券的使用率、卖出的商品中使用优惠券的占比
    merchant1_feature.sales_use_coupon = merchant1_feature.sales_use_coupon.replace(np.nan,0)
    merchant1_feature['merchant_coupon_transfer_rate'] = merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_sales
    merchant1_feature['coupon_rate'] = merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_sales
    merchant1_feature.total_coupon = merchant1_feature.total_coupon.replace(np.nan,0)
    merchant1_feature.to_csv('feature/merchant1_feature.csv',index=None)
    

      4、提取用户的相关特征

    """
    用户的相关信息:
        count_merchant
        user_avg_distance,user_min_distance,user_max_distance
        buy_use_coupon,buy_total,coupon_received
        buy_use_coupon/coupon_received
        buy_use_coupon/buy_total
        user_date_datereceived_gap
    """
    def get_user_date_datereceived_gap(s):
        s = s.split(':')
        return(dt.date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8])) - dt.date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days
    # 数据集3的处理
    user3 = feature3[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]
    t = user3[['user_id']]
    # 去掉数据中重复的用户ID
    t.drop_duplicates(inplace=True)
    # 用户购买商品的种类
    t1 = user3[user3.date!='null'][['user_id','merchant_id']]
    t1.drop_duplicates(inplace=True)
    t1.merchant_id = 1
    t1 = t1.groupby('user_id').agg('sum').reset_index()
    t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
    # 使用了优惠券购买商品的用户id和距离
    t2 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id','distance']]
    t2.replace('null',-1,inplace=True)
    t2.distance = t2.distance.astype('int')
    t2.replace(-1,np.nan,inplace=True)
    # 得到使用优惠券购买商品的用户距商店的最短距离
    t3 = t2.groupby('user_id').agg('min').reset_index()
    t3.rename(columns={'distance':'user_min_dsitance'},inplace=True)
    # 最大距离
    t4 = t2.groupby('user_id').agg('max').reset_index()
    t4.rename(columns={'distance':'user_max_distance'},inplace=True)
    # 平均距离
    t5 = t2.groupby('user_id').agg('mean').reset_index()
    t5.rename(columns={'distance':'user_mean_distance'},inplace=True)
    # 中位数距离
    t6 = t2.groupby('user_id').agg('median').reset_index()
    t6.rename(columns={'distance':'user_median_distance'},inplace=True)
    # 每个用户使用优惠券购买的商品数量
    t7 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id']]
    t7['buy_use_coupon'] = 1
    t7 = t7.groupby('user_id').agg('sum').reset_index()
    # 购买商品的总数
    t8 = user3[user3.date!='null'][['user_id']]
    t8['buy_total'] = 1
    t8 = t8.groupby('user_id').agg('sum').reset_index()
    # 接收优惠券的总数
    t9 = user3[user3.coupon_id!='null'][['user_id']]
    t9['coupon_received'] = 1
    t9 = t9.groupby('user_id').agg('sum').reset_index()
    # 收到优惠券的日期和使用之间的距离
    t10 = user3[(user3.date_received !='null')&(user3.date!='null')][['user_id','date_received','date']]
    t10['user_date_datereceived_gap'] = t10.date+':'+t10.date_received
    t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
    t10 = t10[['user_id','user_date_datereceived_gap']]
    # 将用户优惠券使用时间的间隔取平均值
    t11 = t10.groupby('user_id').agg('mean').reset_index()
    t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
    # 间隔天数的最小值
    t12 = t10.groupby('user_id').agg('min').reset_index()
    t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
    # 间隔天数的最大值
    t13 = t10.groupby('user_id').agg('max').reset_index()
    t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)
    # 合并特征
    user3_feature = pd.merge(t,t1,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t3,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t4,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t5,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t6,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t7,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t8,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t9,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t11,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t12,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t13,on='user_id',how='left')
    user3_feature.count_merchant = user3_feature.count_merchant.replace(np.nan,0)
    user3_feature.buy_user_coupon = user3_feature.buy_use_coupon.replace(np.nan,0)
    user3_feature['buy_use_coupon_rate'] = user3_feature.buy_use_coupon.astype('float')/user3_feature.buy_total.astype('float')
    user3_feature['user_coupon_transfer_rate'] = user3_feature.buy_use_coupon.astype('float')/user3_feature.buy_use_coupon.astype('float')
    user3_feature.buy_total = user3_feature.buy_total.replace(np.nan,0)
    user3_feature.coupon_received = user3_feature.coupon_received.replace(np.nan,0)
    user3_feature.to_csv('feature/user3_feature.csv',index=None)
    
    
    # 对数据集faeture2操作
    user2 = feature2[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]
    t = user2[['user_id']]
    # 去掉数据中重复的用户ID
    t.drop_duplicates(inplace=True)
    # 用户购买商品的种类
    t1 = user2[user2.date!='null'][['user_id','merchant_id']]
    t1.drop_duplicates(inplace=True)
    t1.merchant_id = 1
    t1 = t1.groupby('user_id').agg('sum').reset_index()
    t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
    # 使用了优惠券购买商品的用户id和距离
    t2 = user2[(user2.date!='null')&(user2.coupon_id!='null')][['user_id','distance']]
    t2.replace('null',-1,inplace=True)
    t2.distance = t2.distance.astype('int')
    t2.replace(-1,np.nan,inplace=True)
    # 得到使用优惠券购买商品的用户距商店的最短距离
    t3 = t2.groupby('user_id').agg('min').reset_index()
    t3.rename(columns={'distance':'user_min_dsitance'},inplace=True)
    # 最大距离
    t4 = t2.groupby('user_id').agg('max').reset_index()
    t4.rename(columns={'distance':'user_max_distance'},inplace=True)
    # 平均距离
    t5 = t2.groupby('user_id').agg('mean').reset_index()
    t5.rename(columns={'distance':'user_mean_distance'},inplace=True)
    # 中位数距离
    t6 = t2.groupby('user_id').agg('median').reset_index()
    t6.rename(columns={'distance':'user_median_distance'},inplace=True)
    # 每个用户使用优惠券购买的商品数量
    t7 = user2[(user2.date!='null')&(user2.coupon_id!='null')][['user_id']]
    t7['buy_use_coupon'] = 1
    t7 = t7.groupby('user_id').agg('sum').reset_index()
    # 购买商品的总数
    t8 = user2[user2.date!='null'][['user_id']]
    t8['buy_total'] = 1
    t8 = t8.groupby('user_id').agg('sum').reset_index()
    # 接收优惠券的总数
    t9 = user2[user2.coupon_id!='null'][['user_id']]
    t9['coupon_received'] = 1
    t9 = t9.groupby('user_id').agg('sum').reset_index()
    # 收到优惠券的日期和使用之间的距离
    t10 = user2[(user2.date_received !='null')&(user2.date!='null')][['user_id','date_received','date']]
    t10['user_date_datereceived_gap'] = t10.date+':'+t10.date_received
    t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
    t10 = t10[['user_id','user_date_datereceived_gap']]
    # 将用户优惠券使用时间的间隔取平均值
    t11 = t10.groupby('user_id').agg('mean').reset_index()
    t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
    # 间隔天数的最小值
    t12 = t10.groupby('user_id').agg('min').reset_index()
    t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
    # 间隔天数的最大值
    t13 = t10.groupby('user_id').agg('max').reset_index()
    t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)
    # 合并特征
    user2_feature = pd.merge(t,t1,on='user_id',how='left')
    user2_feature = pd.merge(user2_feature,t3,on='user_id',how='left')
    user2_feature = pd.merge(user2_feature,t4,on='user_id',how='left')
    user2_feature = pd.merge(user2_feature,t5,on='user_id',how='left')
    user2_feature = pd.merge(user2_feature,t6,on='user_id',how='left')
    user2_feature = pd.merge(user2_feature,t7,on='user_id',how='left')
    user2_feature = pd.merge(user2_feature,t8,on='user_id',how='left')
    user2_feature = pd.merge(user2_feature,t9,on='user_id',how='left')
    user2_feature = pd.merge(user2_feature,t11,on='user_id',how='left')
    user2_feature = pd.merge(user2_feature,t12,on='user_id',how='left')
    user2_feature = pd.merge(user2_feature,t13,on='user_id',how='left')
    user2_feature.count_merchant = user2_feature.count_merchant.replace(np.nan,0)
    user2_feature.buy_user_coupon = user2_feature.buy_use_coupon.replace(np.nan,0)
    user2_feature['buy_use_coupon_rate'] = user2_feature.buy_use_coupon.astype('float')/user2_feature.buy_total.astype('float')
    user2_feature['user_coupon_transfer_rate'] = user2_feature.buy_use_coupon.astype('float')/user2_feature.buy_use_coupon.astype('float')
    user2_feature.buy_total = user2_feature.buy_total.replace(np.nan,0)
    user2_feature.coupon_received = user2_feature.coupon_received.replace(np.nan,0)
    user2_feature.to_csv('feature/user2_feature.csv',index=None)
    
    
    # 对数据集dataset1操作
    user1 = feature1[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]
    t = user1[['user_id']]
    # 去掉数据中重复的用户ID
    t.drop_duplicates(inplace=True)
    # 用户购买商品的种类
    t1 = user1[user1.date!='null'][['user_id','merchant_id']]
    t1.drop_duplicates(inplace=True)
    t1.merchant_id = 1
    t1 = t1.groupby('user_id').agg('sum').reset_index()
    t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
    # 使用了优惠券购买商品的用户id和距离
    t2 = user1[(user1.date!='null')&(user1.coupon_id!='null')][['user_id','distance']]
    t2.replace('null',-1,inplace=True)
    t2.distance = t2.distance.astype('int')
    t2.replace(-1,np.nan,inplace=True)
    # 得到使用优惠券购买商品的用户距商店的最短距离
    t3 = t2.groupby('user_id').agg('min').reset_index()
    t3.rename(columns={'distance':'user_min_dsitance'},inplace=True)
    # 最大距离
    t4 = t2.groupby('user_id').agg('max').reset_index()
    t4.rename(columns={'distance':'user_max_distance'},inplace=True)
    # 平均距离
    t5 = t2.groupby('user_id').agg('mean').reset_index()
    t5.rename(columns={'distance':'user_mean_distance'},inplace=True)
    # 中位数距离
    t6 = t2.groupby('user_id').agg('median').reset_index()
    t6.rename(columns={'distance':'user_median_distance'},inplace=True)
    # 每个用户使用优惠券购买的商品数量
    t7 = user1[(user3.date!='null')&(user1.coupon_id!='null')][['user_id']]
    t7['buy_use_coupon'] = 1
    t7 = t7.groupby('user_id').agg('sum').reset_index()
    # 购买商品的总数
    t8 = user1[user1.date!='null'][['user_id']]
    t8['buy_total'] = 1
    t8 = t8.groupby('user_id').agg('sum').reset_index()
    # 接收优惠券的总数
    t9 = user1[user1.coupon_id!='null'][['user_id']]
    t9['coupon_received'] = 1
    t9 = t9.groupby('user_id').agg('sum').reset_index()
    # 收到优惠券的日期和使用之间的距离
    t10 = user1[(user1.date_received !='null')&(user1.date!='null')][['user_id','date_received','date']]
    t10['user_date_datereceived_gap'] = t10.date+':'+t10.date_received
    t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
    t10 = t10[['user_id','user_date_datereceived_gap']]
    # 将用户优惠券使用时间的间隔取平均值
    t11 = t10.groupby('user_id').agg('mean').reset_index()
    t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
    # 间隔天数的最小值
    t12 = t10.groupby('user_id').agg('min').reset_index()
    t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
    # 间隔天数的最大值
    t13 = t10.groupby('user_id').agg('max').reset_index()
    t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)
    # 合并特征
    user1_feature = pd.merge(t,t1,on='user_id',how='left')
    user1_feature = pd.merge(user1_feature,t3,on='user_id',how='left')
    user1_feature = pd.merge(user1_feature,t4,on='user_id',how='left')
    user1_feature = pd.merge(user1_feature,t5,on='user_id',how='left')
    user1_feature = pd.merge(user1_feature,t6,on='user_id',how='left')
    user1_feature = pd.merge(user1_feature,t7,on='user_id',how='left')
    user1_feature = pd.merge(user1_feature,t8,on='user_id',how='left')
    user1_feature = pd.merge(user1_feature,t9,on='user_id',how='left')
    user1_feature = pd.merge(user1_feature,t11,on='user_id',how='left')
    user1_feature = pd.merge(user1_feature,t12,on='user_id',how='left')
    user1_feature = pd.merge(user1_feature,t13,on='user_id',how='left')
    user1_feature.count_merchant = user1_feature.count_merchant.replace(np.nan,0)
    user1_feature.buy_user_coupon = user1_feature.buy_use_coupon.replace(np.nan,0)
    user1_feature['buy_use_coupon_rate'] = user1_feature.buy_use_coupon.astype('float')/user1_feature.buy_total.astype('float')
    user1_feature['user_coupon_transfer_rate'] = user1_feature.buy_use_coupon.astype('float')/user1_feature.buy_use_coupon.astype('float')
    user1_feature.buy_total = user1_feature.buy_total.replace(np.nan,0)
    user1_feature.coupon_received = user1_feature.coupon_received.replace(np.nan,0)
    user1_feature.to_csv('feature/user1_feature.csv',index=None)
    

      5、用户和商店之间联系的特征

    # 对数据集feature3进行处理
    
    # 用户和商店之间联系的特征
    all_user_merchant = feature3[['user_id','merchant_id']]
    all_user_merchant.drop_duplicates(inplace=True)
    # 只保留销售了商品的商户id
    t = feature3[['user_id','merchant_id','date']]
    t = t[t.date!='null'][['user_id','merchant_id']]
    # 用户一共买了特定商户多少商品
    t['user_merchant_buy_total'] = 1
    t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t.drop_duplicates(inplace=True)
    t1 = feature3[['user_id','merchant_id','coupon_id']]
    t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
    # 用户一共收到一个商户的优惠劵数目
    t['user_merchant_received'] = 1
    t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t1.drop_duplicates(inplace = True)
    
    t2 = feature3[['user_id','merchant_id','date','date_received']]
    t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
    # 用户在一家商户使用优惠券购买的商品数目
    t2['user_merchant_buy_use_coupon'] = 1
    t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t2.drop_duplicates(inplace = True)
    # 用户在一家商家的说有记录总数
    t3 = feature3[['user_id','merchant_id']]
    t3['user_merchant_any'] = 1
    t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t3.drop_duplicates(inplace=True)
    # 用户未使用优惠券购买的商品数目
    t4 = feature3[['user_id','merchant_id','date','coupon_id']]
    t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
    t4['user_merchant_buy_common'] = 1
    t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t4.drop_duplicates(inplace = True)
    user_merchant3 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t1,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t2,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t3,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t4,on=['user_id','merchant_id'],how='left')
    # 都是针对一个商户和一个用户
    user_merchant3['user_merchant_coupon_transfer_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_received.astype('float')
    user_merchant3['user_merchant_coupon_buy_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float')/user_merchant3.user_merchant_buy_total.astype('float')
    user_merchant3['user_merchant_rate'] = user_merchant3.user_merchant_buy_total.astype('float')/user_merchant3.user_merchant_any.astype('float')
    user_merchant3['user_merchant_common_buy_rate'] = user_merchant3.user_merchant_buy_common.astype('float')/user_merchant3.user_merchant_buy_total.astype('float')
    user_merchant3.to_csv('feature/user_merchant3.csv',index=None)
    
    
    # 对于数据集feature2
    all_user_merchant = feature2[['user_id','merchant_id']]
    all_user_merchant.drop_duplicates(inplace=True)
    t = feature2[['user_id','merchant_id','date']]
    t = t[t.date!='null'][['user_id','merchant_id']]
    t['user_merchant_buy_total'] = 1
    t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t.drop_duplicates(inplace=True)
    t1 = feature2[['user_id','merchant_id','coupon_id']]
    t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
    t1['user_merchant_received'] = 1
    t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t1.drop_duplicates(inplace = True)
    
    t2 = feature2[['user_id','merchant_id','date','date_received']]
    t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
    t2['user_merchant_buy_use_coupon'] = 1
    t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t2.drop_duplicates(inplace=True)
    
    t3 = feature2[['user_id','merchant_id']]
    t3['user_merchant_any'] = 1
    t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t3.drop_duplicates(inplace=True)
    
    t4 = feature2[['user_id','merchant_id','date','coupon_id']]
    t4 = t4[(t4.date!='null')&(t4.coupon_id == 'null')][['user_id','merchant_id']]
    t4['user_merchant_buy_common'] = 1
    t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t4.drop_duplicates(inplace=True)
    
    user_merchant2 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
    user_merchant2 = pd.merge(user_merchant2,t1,on=['user_id','merchant_id'],how='left')
    user_merchant2 = pd.merge(user_merchant2,t2,on=['user_id','merchant_id'],how='left')
    user_merchant2 = pd.merge(user_merchant2,t3,on=['user_id','merchant_id'],how='left')
    user_merchant2 = pd.merge(user_merchant2,t4,on=['user_id','merchant_id'],how='left')
    
    user_merchant2.user_merchant_buy_use_coupon = user_merchant2.user_merchant_buy_use_coupon.replace(np.nan,0)
    user_merchant2.user_merchant_buy_common = user_merchant2.user_merchant_buy_common.replace(np.nan,0)
    
    user_merchant2['user_merchant_coupon_transfer_rate'] = user_merchant2.user_merchant_buy_use_coupon.astype('float')/user_merchant2.user_merchant_received.astype('float')
    user_merchant2['user_merchant_coupon_buy_rate'] = user_merchant2.user_merchant_buy_use_coupon.astype('float')/user_merchant2.user_merchant_buy_total.astype('float')
    user_merchant2['user_merchant_rate'] = user_merchant2.user_merchant_buy_total.astype('float')/user_merchant2.user_merchant_any.astype('float')
    user_merchant2['user_merchant_common_buy_rate'] = user_merchant2.user_merchant_buy_common.astype('float')/user_merchant2.user_merchant_buy_total.astype('float')
    user_merchant2.to_csv('feature/user_merchant2.csv',index=None)
    
    
    # 对于数据集feature1
    all_user_merchant = feature1[['user_id','merchant_id']]
    all_user_merchant.drop_duplicates(inplace=True)
    t = feature1[['user_id','merchant_id','date']]
    t = t[t.date!='null'][['user_id','merchant_id']]
    t['user_merchant_buy_total'] = 1
    t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t.drop_duplicates(inplace=True)
    t1 = feature1[['user_id','merchant_id','coupon_id']]
    t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
    t1['user_merchant_received'] = 1
    t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t1.drop_duplicates(inplace = True)
    
    t2 = feature1[['user_id','merchant_id','date','date_received']]
    t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
    t2['user_merchant_buy_use_coupon'] = 1
    t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t2.drop_duplicates(inplace=True)
    
    t3 = feature1[['user_id','merchant_id']]
    t3['user_merchant_any'] = 1
    t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t3.drop_duplicates(inplace=True)
    
    t4 = feature1[['user_id','merchant_id','date','coupon_id']]
    t4 = t4[(t4.date!='null')&(t4.coupon_id == 'null')][['user_id','merchant_id']]
    t4['user_merchant_buy_common'] = 1
    t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t4.drop_duplicates(inplace=True)
    
    user_merchant1 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
    user_merchant1 = pd.merge(user_merchant1,t1,on=['user_id','merchant_id'],how='left')
    user_merchant1 = pd.merge(user_merchant1,t2,on=['user_id','merchant_id'],how='left')
    user_merchant1 = pd.merge(user_merchant1,t3,on=['user_id','merchant_id'],how='left')
    user_merchant1 = pd.merge(user_merchant1,t4,on=['user_id','merchant_id'],how='left')
    
    user_merchant1.user_merchant_buy_use_coupon = user_merchant1.user_merchant_buy_use_coupon.replace(np.nan,0)
    user_merchant1.user_merchant_buy_common = user_merchant1.user_merchant_buy_common.replace(np.nan,0)
    
    user_merchant1['user_merchant_coupon_transfer_rate'] = user_merchant1.user_merchant_buy_use_coupon.astype('float')/user_merchant1.user_merchant_received.astype('float')
    user_merchant1['user_merchant_coupon_buy_rate'] = user_merchant1.user_merchant_buy_use_coupon.astype('float')/user_merchant1.user_merchant_buy_total.astype('float')
    user_merchant1['user_merchant_rate'] = user_merchant1.user_merchant_buy_total.astype('float')/user_merchant1.user_merchant_any.astype('float')
    user_merchant1['user_merchant_common_buy_rate'] = user_merchant1.user_merchant_buy_common.astype('float')/user_merchant1.user_merchant_buy_total.astype('float')
    user_merchant1.to_csv('feature/user_merchant1.csv',index=None)
    

      三、特征组合

    # dataset1,2,3分别是划分时间滑窗后的所提取的特征的组合,方便接下来的划分训练集和测试集
    # 此次合并后dataset1,2,3的特征类型是一样的,然后给dataset1,2添加标签,标签是get_label(s)这个函数所生成的
    def get_label(s):
        s = s.split(':')
        if s[0]=='null':
            return 0
        elif (dt.date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8]))-dt.date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days<15:
            return 1
        else:
            return -1
    
    coupon3 = pd.read_csv('feature/coupon3_feature.csv')
    merchant3 = pd.read_csv('feature/merchant3_feature.csv')
    user3 = pd.read_csv('feature/user3_feature.csv')
    other_feature3 = pd.read_csv('feature/other_feature3.csv')
    user_merchant3 = pd.read_csv('feature/user_merchant3.csv')
    dataset3 = pd.merge(coupon3,merchant3,on='merchant_id',how='left')
    dataset3 = pd.merge(dataset3,user3,on='user_id',how='left')
    dataset3 = pd.merge(dataset3,user_merchant3,on=['user_id','merchant_id'],how='left')
    dataset3 = pd.merge(dataset3,other_feature3,on=['user_id','coupon_id','date_received'],how='left')
    dataset3.drop_duplicates(inplace=True)
    
    dataset3.user_merchant_buy_total = dataset3.user_merchant_buy_total.replace(np.nan,0)
    dataset3.user_merchant_any = dataset3.user_merchant_any.replace(np.nan,0)
    dataset3.user_merchant_received = dataset3.user_merchant_received.replace(np.nan,0)
    dataset3['is_weekend'] = dataset3.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
    # get_dummies 进行one-hot编码
    weekday_dummies = pd.get_dummies(dataset3.day_of_week)
    weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
    dataset3 = pd.concat([dataset3,weekday_dummies],axis=1)
    # dataset3.columns
    dataset3.drop(['merchant_id','day_of_week','coupon_count'],axis=1,inplace=True)
    dataset3 = dataset3.replace('null',np.nan)
    dataset3.to_csv('dataset/dataset3.csv',index=None)
    
    coupon2 = pd.read_csv('feature/coupon2_feature.csv')
    merchant2 = pd.read_csv('feature/merchant2_feature.csv')
    user2 = pd.read_csv('feature/user2_feature.csv')
    user_merchant2 = pd.read_csv('feature/user_merchant2.csv')
    other_feature2 = pd.read_csv('feature/other_Feature2.csv')
    dataset2 = pd.merge(coupon2,merchant2,on='merchant_id',how='left')
    dataset2 = pd.merge(dataset2,user2,on='user_id',how='left')
    dataset2 = pd.merge(dataset2,user_merchant2,on=['user_id','merchant_id'],how='left')
    dataset2 = pd.merge(dataset2,other_feature2,on=['user_id','coupon_id','date_received'],how='left')
    dataset2.drop_duplicates(inplace=True)
    # dataset2.head()
    
    dataset2.user_merchant_buy_total = dataset2.user_merchant_buy_total.replace(np.nan,0)
    dataset2.user_merchant_any = dataset2.user_merchant_any.replace(np.nan,0)
    dataset2.user_merchant_received = dataset2.user_merchant_received.replace(np.nan,0)
    dataset2['is_weekend'] = dataset2.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
    weekday_dummies = pd.get_dummies(dataset2.day_of_week)
    weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
    dataset2 = pd.concat([dataset2,weekday_dummies],axis=1)
    dataset2['label'] = dataset2.date.astype('str') + ':' +  dataset2.date_received.astype('str')
    dataset2.label = dataset2.label.apply(get_label)
    dataset2.drop(['merchant_id','day_of_week','date','date_received','coupon_id','coupon_count'],axis=1,inplace=True)
    dataset2 = dataset2.replace('null',np.nan)
    dataset2.to_csv('dataset/dataset2.csv',index=None)
    
    
    coupon1 = pd.read_csv('feature/coupon1_feature.csv')
    merchant1 = pd.read_csv('feature/merchant1_feature.csv')
    user1 = pd.read_csv('feature/user1_feature.csv')
    user_merchant1 = pd.read_csv('feature/user_merchant1.csv')
    other_feature1 = pd.read_csv('feature/other_feature1.csv')
    dataset1 = pd.merge(coupon1,merchant1,on='merchant_id',how='left')
    dataset1 = pd.merge(dataset1,user1,on='user_id',how='left')
    dataset1 = pd.merge(dataset1,user_merchant1,on=['user_id','merchant_id'],how='left')
    dataset1 = pd.merge(dataset1,other_feature1,on=['user_id','coupon_id','date_received'],how='left')
    dataset1.drop_duplicates(inplace=True)
    # print dataset1.shape
    
    dataset1.user_merchant_buy_total = dataset1.user_merchant_buy_total.replace(np.nan,0)
    dataset1.user_merchant_any = dataset1.user_merchant_any.replace(np.nan,0)
    dataset1.user_merchant_received = dataset1.user_merchant_received.replace(np.nan,0)
    dataset1['is_weekend'] = dataset1.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
    weekday_dummies = pd.get_dummies(dataset1.day_of_week)
    weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
    dataset1 = pd.concat([dataset1,weekday_dummies],axis=1)
    dataset1['label'] = dataset1.date.astype('str') + ':' +  dataset1.date_received.astype('str')
    dataset1.label = dataset1.label.apply(get_label)
    dataset1.drop(['merchant_id','day_of_week','date','date_received','coupon_id','coupon_count'],axis=1,inplace=True)
    dataset1 = dataset1.replace('null',np.nan)
    dataset1.to_csv('dataset/dataset1.csv',index=None)
    

      四、模型训练

    import pandas as pd
    import xgboost as xgb
    from sklearn.preprocessing import MinMaxScaler
    
    dataset1 = pd.read_csv('dataset/dataset1.csv')
    dataset1.label.replace(-1,0,inplace=True)
    dataset2 = pd.read_csv('dataset/dataset2.csv')
    dataset2.label.replace(-1,0,inplace=True)
    dataset3 = pd.read_csv('dataset/dataset3.csv')
    # 去重
    dataset1.drop_duplicates(inplace=True)
    dataset2.drop_duplicates(inplace=True)
    dataset3.drop_duplicates(inplace=True)
    
    dataset12 = pd.concat([dataset1,dataset2],axis=0)
    # 再次组合成训练集
    dataset1_y = dataset1.label
    dataset1_x = dataset1.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1)  # 'day_gap_before','day_gap_after' cause overfitting, 0.77
    dataset2_y = dataset2.label
    dataset2_x = dataset2.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1)
    dataset12_y = dataset12.label
    dataset12_x = dataset12.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1)
    dataset3_preds = dataset3[['user_id','coupon_id','date_received']]
    dataset3_x = dataset3.drop(['user_id','coupon_id','date_received','day_gap_before','day_gap_after'],axis=1)
    # dataset3_x = dataset3.drop(['user_id','coupon_id','date_received'],axis=1)
    
    # print(dataset1_x.shape,dataset2_x.shape,dataset3_x.shape)
    
    dataset1 = xgb.DMatrix(dataset1_x,label=dataset1_y)
    dataset2 = xgb.DMatrix(dataset2_x,label=dataset2_y)
    dataset12= xgb.DMatrix(dataset12_x,label=dataset12_y)
    dataset3 = xgb.DMatrix(dataset3_x)
    # 在XGBoost中,要将处理的数据存储在对象DMatrix中,方便下一步处理

      对特征筛选,训练,方便除去对标签影响因子小的特征属性,即剪枝

    params={'booster':'gbtree',
    	    'objective': 'rank:pairwise',
    	    'eval_metric':'auc',
    	    'gamma':0.1,
    	    'min_child_weight':1.1,
    	    'max_depth':5,
    	    'lambda':10,
    	    'subsample':0.7,
    	    'colsample_bytree':0.7,
    	    'colsample_bylevel':0.7,
    	    'eta': 0.01,
    	    'tree_method':'exact',
    	    'seed':0,
    	    'nthread':12
    	    }
    watchlist = [(dataset12,'train')]
    # 模型训练 model = xgb.train(params,dataset12,num_boost_round=3500,evals=watchlist) # 对dataset3进行预测 dataset3_preds['label'] = model.predict(dataset3) dataset3_preds.label = MinMaxScaler().fit_transform(dataset3_preds.label.reshape(-1, 1)) dataset3_preds.sort_values(by=['coupon_id','label'],inplace=True) dataset3_preds.to_csv("xgb_preds.csv",index=None,header=None) # print(dataset3_preds.describe()) # feature_score来保存特征对标签的影响因子 feature_score = model.get_fscore() feature_score = sorted(feature_score.items(), key=lambda x:x[1],reverse=True) fs = [] for (key,value) in feature_score: fs.append("{0},{1} ".format(key,value)) with open('xgb_feature_score.csv','w') as f: f.writelines("feature,score ") f.writelines(fs)

      

      五、总结

      本次按着大佬的思路做了一次,感觉对自己的提升挺大的,学会了好多东西,总结下自己这段时间的工作吧,弄懂了一个完整的数据处理过程到底在干什么、怎么弄、以及锻炼了自己敲代码的能力,个人感觉真是在课本学习和实际操作过程中差距还是挺大的,实际操作下,学东西会更快,以后要多多参加这种竞赛,看看别人的想法,我这算是入门级的了。

      六、附录

      这些代码都在Jupyter Notebook上完美运行,所涉及的知识点也做了一些笔记整理,具体详见我的其他随笔。

      最后,感谢第一名大佬提供的源码。

  • 相关阅读:
    时间选择器和日期选择器
    paip.c++ qt 项目工程互相引用的方法
    leetcode_question_85 Largest Rectangle in Histogram
    在VirtualBox虚拟机上采集Fedora15系统
    Oracle
    VC6.0调试大全
    oracle中的exists 和not exists 用法详解
    vi常用命令
    【虚拟化实战】容灾设计之四VPLEX
    CentOS6.3 安装配置 ant
  • 原文地址:https://www.cnblogs.com/lvpengbo/p/8822214.html
Copyright © 2020-2023  润新知