• 数据挖掘实践(35):实战--高潜用户购买画像(四)构造训练集/测试集


    4 构造训练集/测试集

    • 标签,采用滑动窗口的方式,构造训练集的时候针对产生购买的行为标记为1
    • 整合特征
    def get_labels(start_date, end_date, all_actions):
        """Build the positive-label table for one label window.

        Takes the rows returned by ``get_actions(start_date, end_date,
        all_actions)``, keeps those with ``type == 4`` and ``cate == 8``
        (per the original notes: purchases, narrowed to category 8), and
        emits one row per distinct ``(user_id, sku_id)`` pair with ``label``
        set to 1.

        Args:
            start_date: Window start, forwarded unchanged to ``get_actions``.
            end_date: Window end, forwarded unchanged to ``get_actions``.
            all_actions: Action log DataFrame; must expose the columns
                ``user_id``, ``sku_id``, ``type`` and ``cate``.

        Returns:
            DataFrame with columns ``user_id``, ``sku_id`` and ``label``,
            ordered by ``user_id`` then ``sku_id`` with a fresh 0..n-1 index.
        """
        keys = ['user_id', 'sku_id']
        actions = get_actions(start_date, end_date, all_actions)
        is_target = (actions['type'] == 4) & (actions['cate'] == 8)

        # Only the distinct key pairs matter. Selecting the keys first avoids
        # aggregating every other column (including the string ``time``
        # column) the way a blanket groupby().sum() would. dropna + sort +
        # reset_index reproduce the row set, order and index that
        # groupby(..., as_index=False) yields.
        labels = (actions.loc[is_target, keys]
                  .dropna()
                  .drop_duplicates()
                  .sort_values(keys)
                  .reset_index(drop=True))
        labels['label'] = 1
        return labels
    # Notebook transcript: statements are followed by the output they echoed,
    # so lines that are not Python syntax below are captured cell output.
    train_start_date = '2016-03-01'
    train_actions = None
    # Load the action log once; later cells pass this same frame around.
    all_actions = get_all_action()
    print ("get all actions!")
    get all actions!
    all_actions.head()

    all_actions.info()
    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 34456272 entries, 0 to 11485423
    Data columns (total 7 columns):
    user_id     float32
    sku_id      float32
    time        object
    model_id    float32
    type        float32
    cate        float32
    brand       float32
    dtypes: float32(6), object(1)
    memory usage: 1.3+ GB
    all_actions.shape
    (34456272, 7)
    user = get_basic_user_feat()
    print ('get_basic_user_feat finsihed')
    get_basic_user_feat finsihed
    user.head()

    product = get_basic_product_feat()
    print ('get_basic_product_feat finsihed')
    get_basic_product_feat finsihed
    product.head()

    # The end date is three days after the start date.
    train_start_date = '2016-03-01'
    train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)
    train_end_date
    datetime.datetime(2016, 3, 4, 0, 0)
    train_end_date = train_end_date.strftime('%Y-%m-%d')
    # Corrected time span for prod_acc / cate_acc: 30 days back from train_end_date.
    start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)
    start_days = start_days.strftime('%Y-%m-%d')
    print (train_end_date)
    2016-03-04
    start_days
    '2016-02-03'

    4.1 构造训练集

    def make_actions(user, product, all_actions, train_start_date):
        """Assemble one down-sampled training frame for a single window.

        The feature window ends three days after ``train_start_date``; the
        label window is the five days that follow the feature-window end.
        Behaviour features are computed for look-backs of 3, 5, 7, 10, 15, 21
        and 30 days before the feature-window end and left-joined onto the
        3-day frame. User, product, category and comment features and the
        labels are then left-joined, and all remaining NaNs are filled with 0
        (so rows without a matching label get ``label == 0``).

        Args:
            user: Basic user features, joined on ``user_id``.
            product: Basic product features, joined on ``sku_id`` and ``cate``.
            all_actions: Full action log handed to the feature helpers.
            train_start_date: Window start as a ``'%Y-%m-%d'`` string.

        Returns:
            DataFrame holding every positive row followed by a random sample
            of negative rows, at most ten negatives per positive, with a fresh
            0..n-1 index. The sample is unseeded, so it differs between runs.
        """
        date_fmt = '%Y-%m-%d'
        window_end = datetime.strptime(train_start_date, date_fmt) + timedelta(days=3)
        train_end_date = window_end.strftime(date_fmt)
        # Corrected time span for prod_acc / cate_acc: 30 days back from the window end.
        start_days = (window_end - timedelta(days=30)).strftime(date_fmt)
        print (train_end_date)
        user_acc = get_recent_user_feat(train_end_date, all_actions)
        print ('get_recent_user_feat finsihed')

        user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)
        print ('get_user_cate_feature finished')

        product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)
        print ('get_accumulate_product_feat finsihed')
        cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)
        print ('get_accumulate_cate_feat finsihed')
        comment_acc = get_comments_product_feat(train_end_date)
        print ('get_comments_product_feat finished')

        # Labels come from the five days right after the feature window.
        test_start_date = train_end_date
        test_end_date = datetime.strptime(test_start_date, date_fmt) + timedelta(days=5)
        test_end_date = test_end_date.strftime(date_fmt)
        labels = get_labels(test_start_date, test_end_date, all_actions)
        print ("get labels")

        actions = None
        for i in (3, 5, 7, 10, 15, 21, 30):
            # A separate name keeps the 30-day start_days above from being
            # overwritten by the per-look-back start date.
            lookback_start = (window_end - timedelta(days=i)).strftime(date_fmt)
            window_feat = get_action_feat(lookback_start, train_end_date, all_actions, i)
            if actions is None:
                actions = window_feat
            else:
                # Note the join key: (user_id, sku_id, cate).
                actions = pd.merge(actions, window_feat, how='left',
                                   on=['user_id', 'sku_id', 'cate'])

        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, user_acc, how='left', on='user_id')
        # Blank the index name before merging on the 'user_id' column
        # (presumably to avoid an index/column name clash - confirm against
        # get_user_cate_feature).
        user_cate.index.name = ""
        actions = pd.merge(actions, user_cate, how='left', on='user_id')
        # Note the join key: (sku_id, cate).
        actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])
        actions = pd.merge(actions, product_acc, how='left', on='sku_id')
        actions = pd.merge(actions, cate_acc, how='left', on='cate')
        actions = pd.merge(actions, comment_acc, how='left', on='sku_id')
        actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])
        # Mainly fills the gaps left by joining product, comment and label data.
        actions = actions.fillna(0)

        # Down-sample the negatives.
        action_positive = actions[actions['label'] == 1]
        action_negative = actions[actions['label'] == 0]
        del actions  # release the full frame before building the sample
        # sample() without replacement raises if asked for more rows than
        # exist, so cap the request at the number of negatives available.
        neg_len = min(len(action_positive) * 10, len(action_negative))
        action_negative = action_negative.sample(n=neg_len)
        action_sample = pd.concat([action_positive, action_negative], ignore_index=True)

        return action_sample
    def make_train_set(train_start_date, setNums, f_path, all_actions):
        """Build a multi-window training set and write it to ``f_path``.

        Calls ``make_actions`` for ``setNums`` consecutive windows, moving the
        start date forward one day per round, stacks the resulting frames and
        saves them as CSV without the index.

        Args:
            train_start_date: Start of the first window, ``'%Y-%m-%d'``.
            setNums: Number of sliding windows to generate; must be >= 1.
            f_path: Destination CSV path.
            all_actions: Full action log passed through to ``make_actions``.

        Raises:
            ValueError: If ``setNums`` is smaller than 1 (previously this
                surfaced only at the end as an AttributeError on ``None``).
        """
        if setNums < 1:
            raise ValueError("setNums must be at least 1")

        user = get_basic_user_feat()
        print ('get_basic_user_feat finsihed')
        product = get_basic_product_feat()
        print ('get_basic_product_feat finsihed')

        # Sliding window: collect every round's frame and concatenate once at
        # the end instead of re-copying the growing frame on each round.
        frames = []
        for i in range(setNums):
            print (train_start_date)
            frames.append(make_actions(user, product, all_actions, train_start_date))
            # Move the window forward by one day for the next round.
            train_start_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=1)
            train_start_date = train_start_date.strftime('%Y-%m-%d')
            print ("round {0}/{1} over!".format(i+1, setNums))

        train_actions = pd.concat(frames, ignore_index=True)
        train_actions.to_csv(f_path, index=False)
    # Notebook transcript: manual run of the per-window feature steps for the
    # window starting 2016-02-01, using the same date arithmetic as
    # make_actions. Lines that are not Python syntax are captured cell output.
    train_start_date = '2016-02-01'
    train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)
    train_end_date
    
    train_end_date = train_end_date.strftime('%Y-%m-%d')
    # Corrected time span for prod_acc / cate_acc: 30 days back from train_end_date.
    start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)
    start_days = start_days.strftime('%Y-%m-%d')
    print (train_end_date)
    2016-02-04
    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)
    print ('get_user_cate_feature finished')
    get_user_cate_feature finished
    product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)
    print ('get_accumulate_product_feat finsihed')
    get_accumulate_product_feat finsihed
    cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)
    print ('get_accumulate_cate_feat finsihed')
    get_accumulate_cate_feat finsihed
    # Training set: 20 sliding windows starting 2016-02-01, written to train_set.csv.
    train_start_date = '2016-02-01'
    make_train_set(train_start_date, 20, 'train_set.csv',all_actions)
    get_basic_user_feat finsihed
    get_basic_product_feat finsihed
    2016-02-01
    2016-02-04

    4.2 构造验证集(线下测试集)

    def make_val_set(train_start_date, train_end_date, f_path="test_set.csv"):
        """Build the offline validation feature table and write it to ``f_path``.

        The section's driver call uses the name ``make_val_set`` and passes an
        output path, so the function carries that name and accepts ``f_path``
        rather than hard-coding the file name.

        Behaviour features are computed for look-backs of 3, 5, 7, 10, 15, 21
        and 30 days before ``train_end_date`` and left-joined onto the 3-day
        frame; user, product, category and comment features are then
        left-joined and remaining NaNs are filled with 0.

        NOTE(review): no label column is attached here, unlike make_actions;
        confirm whether the validation file is expected to carry labels.

        Args:
            train_start_date: Start date passed to ``get_user_cate_feature``.
            train_end_date: Feature-window end as a ``'%Y-%m-%d'`` string.
            f_path: Destination CSV path. Defaults to ``"test_set.csv"``, the
                file name the original two-argument version always wrote.
        """
        date_fmt = '%Y-%m-%d'
        window_end = datetime.strptime(train_end_date, date_fmt)
        start_days = (window_end - timedelta(days=30)).strftime(date_fmt)
        all_actions = get_all_action()
        print ("get all actions!")
        user = get_basic_user_feat()
        print ('get_basic_user_feat finsihed')
        product = get_basic_product_feat()
        print ('get_basic_product_feat finsihed')

        user_acc = get_recent_user_feat(train_end_date, all_actions)
        print ('get_accumulate_user_feat finsihed')

        user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)
        print ('get_user_cate_feature finished')

        product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)
        print ('get_accumulate_product_feat finsihed')
        cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)
        print ('get_accumulate_cate_feat finsihed')
        comment_acc = get_comments_product_feat(train_end_date)

        actions = None
        for i in (3, 5, 7, 10, 15, 21, 30):
            lookback_start = (window_end - timedelta(days=i)).strftime(date_fmt)
            window_feat = get_action_feat(lookback_start, train_end_date, all_actions, i)
            if actions is None:
                actions = window_feat
            else:
                actions = pd.merge(actions, window_feat, how='left',
                                   on=['user_id', 'sku_id', 'cate'])

        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, user_acc, how='left', on='user_id')
        # Blank the index name before merging on the 'user_id' column
        # (presumably to avoid an index/column name clash - confirm against
        # get_user_cate_feature).
        user_cate.index.name = ""
        actions = pd.merge(actions, user_cate, how='left', on='user_id')
        # Note the join key: (sku_id, cate).
        actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])
        actions = pd.merge(actions, product_acc, how='left', on='sku_id')
        actions = pd.merge(actions, cate_acc, how='left', on='cate')
        actions = pd.merge(actions, comment_acc, how='left', on='sku_id')

        actions = actions.fillna(0)

        actions.to_csv(f_path, index=False)

    make_val_set('2016-02-23', '2016-02-26', 'val_3.csv')

    4.3 构造测试集

    def make_test_set(train_start_date, train_end_date, f_path="test_set.csv"):
        """Build the test feature table and write it to ``f_path``.

        Behaviour features are computed for look-backs of 3, 5, 7, 10, 15, 21
        and 30 days before ``train_end_date`` and left-joined onto the 3-day
        frame; user, product, category and comment features are then
        left-joined and remaining NaNs are filled with 0. No label column is
        attached.

        Args:
            train_start_date: Start date passed to ``get_user_cate_feature``.
            train_end_date: Feature-window end as a ``'%Y-%m-%d'`` string.
            f_path: Destination CSV path; defaults to ``"test_set.csv"``, the
                file name this function has always written.
        """
        date_fmt = '%Y-%m-%d'
        window_end = datetime.strptime(train_end_date, date_fmt)
        start_days = (window_end - timedelta(days=30)).strftime(date_fmt)
        all_actions = get_all_action()
        print ("get all actions!")
        user = get_basic_user_feat()
        print ('get_basic_user_feat finsihed')
        product = get_basic_product_feat()
        print ('get_basic_product_feat finsihed')

        user_acc = get_recent_user_feat(train_end_date, all_actions)
        print ('get_accumulate_user_feat finsihed')

        user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)
        print ('get_user_cate_feature finished')

        product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)
        print ('get_accumulate_product_feat finsihed')
        cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)
        print ('get_accumulate_cate_feat finsihed')
        comment_acc = get_comments_product_feat(train_end_date)

        actions = None
        for i in (3, 5, 7, 10, 15, 21, 30):
            lookback_start = (window_end - timedelta(days=i)).strftime(date_fmt)
            window_feat = get_action_feat(lookback_start, train_end_date, all_actions, i)
            if actions is None:
                actions = window_feat
            else:
                actions = pd.merge(actions, window_feat, how='left',
                                   on=['user_id', 'sku_id', 'cate'])

        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, user_acc, how='left', on='user_id')
        # The training builder blanks the index name before this merge; do the
        # same here so train and test features are assembled identically
        # (presumably it avoids an index/column name clash on 'user_id' -
        # confirm against get_user_cate_feature).
        user_cate.index.name = ""
        actions = pd.merge(actions, user_cate, how='left', on='user_id')
        # Note the join key: (sku_id, cate).
        actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])
        actions = pd.merge(actions, product_acc, how='left', on='sku_id')
        actions = pd.merge(actions, cate_acc, how='left', on='cate')
        actions = pd.merge(actions, comment_acc, how='left', on='sku_id')

        actions = actions.fillna(0)

        actions.to_csv(f_path, index=False)
  • 相关阅读:
    基本算法2
    基本算法
    读书笔记 《跟老齐学python》
    python杂记 20200207 离线安装 正则
    傻傻分不清之 Cookie、Session、Token、JWT 转载:https://juejin.im/post/5e055d9ef265da33997a42cc
    并发相关 杂记
    原根
    POJ2749 Building roads
    luogu P4735 最大异或和
    SP913 QTREE2
  • 原文地址:https://www.cnblogs.com/qiu-hua/p/14400887.html
Copyright © 2020-2023  润新知