4 构造训练集/测试集
- 标签,采用滑动窗口的方式,构造训练集的时候针对产生购买的行为标记为1
- 整合特征
def get_labels(start_date, end_date, all_actions):
    """Return the positive (user, item) pairs observed in a date window.

    A pair is positive when the window contains at least one action with
    ``type == 4`` (a purchase, per the original author's notes) on an item
    whose ``cate == 8``.  An earlier revision labelled every ``type == 4``
    action; the category filter narrows the target to category 8.

    Args:
        start_date: Window start, forwarded unchanged to ``get_actions``.
        end_date: Window end, forwarded unchanged to ``get_actions``.
        all_actions: Full action log, forwarded to ``get_actions``.  The
            frame that comes back must expose ``user_id``, ``sku_id``,
            ``type`` and ``cate`` columns.

    Returns:
        DataFrame with columns ``user_id``, ``sku_id`` and ``label`` (always
        1), one row per distinct pair, sorted by ``user_id`` then ``sku_id``
        with a fresh 0..n-1 index.
    """
    actions = get_actions(start_date, end_date, all_actions)
    is_target_purchase = (actions['type'] == 4) & (actions['cate'] == 8)

    # Only the distinct keys are needed.  The previous groupby(...).sum()
    # aggregated every other column as well (the action log shown in this
    # notebook includes an object-typed ``time`` column) just to throw the
    # result away.  dropna/sort/reset_index reproduce groupby's defaults:
    # NaN keys are dropped, keys come back sorted, index is positional.
    key_cols = ['user_id', 'sku_id']
    labels = (
        actions.loc[is_target_purchase, key_cols]
        .dropna()
        .drop_duplicates()
        .sort_values(key_cols)
        .reset_index(drop=True)
    )
    labels['label'] = 1
    return labels
# Notebook setup cell: pick the first window start date and load the action
# log once so later cells can reuse it.
# get_all_action() is defined outside this excerpt.
train_start_date = '2016-03-01'
train_actions = None
all_actions = get_all_action()
print ("get all actions!")
get all actions!
# Inspection cell: preview the first rows of the action log.
all_actions.head()
# Inspection cell: print column dtypes and memory usage of the action log.
all_actions.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 34456272 entries, 0 to 11485423 Data columns (total 7 columns): user_id float32 sku_id float32 time object model_id float32 type float32 cate float32 brand float32 dtypes: float32(6), object(1) memory usage: 1.3+ GB
# Inspection cell: (rows, columns) of the action log.
all_actions.shape
(34456272, 7)
# Load the basic user feature table; get_basic_user_feat() is defined
# outside this excerpt.  make_train_set calls the same helper itself.
user = get_basic_user_feat()
print ('get_basic_user_feat finsihed')
get_basic_user_feat finsihed
# Inspection cell: preview the first rows of the user feature table.
user.head()
# Load the basic product feature table; get_basic_product_feat() is defined
# outside this excerpt.  make_train_set calls the same helper itself.
product = get_basic_product_feat()
print ('get_basic_product_feat finsihed')
get_basic_product_feat finsihed
# Inspection cell: preview the first rows of the product feature table.
product.head()
# Walk-through of the window arithmetic used by make_actions: the feature
# window ends three days after its start date.
train_start_date = '2016-03-01'
train_end_date = timedelta(days=3) + datetime.strptime(train_start_date, '%Y-%m-%d')
# Bare expression so the notebook displays the resulting datetime.
train_end_date
datetime.datetime(2016, 3, 4, 0, 0)
# Convert the window end (a datetime left by the previous cell) back to the
# 'YYYY-MM-DD' string form used elsewhere in this notebook.
train_end_date = format(train_end_date, '%Y-%m-%d')
# Corrected time span for prod_acc / cate_acc: 30 days back from the window end.
start_days = (
    datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)
).strftime('%Y-%m-%d')
print(train_end_date)
2016-03-04
# Inspection cell: show the 30-day look-back start date computed above.
start_days
'2016-02-03'
4.1 构造训练集
def make_actions(user, product, all_actions, train_start_date,
                 neg_ratio=10, random_state=None):
    """Build one labelled, down-sampled training window.

    The feature window ends three days after ``train_start_date``.  Action
    features are computed for look-backs of 3, 5, 7, 10, 15, 21 and 30 days
    ending at that date and left-joined onto the 3-day frame, so only rows
    present in the 3-day window are kept.  User, user-category, product,
    category and comment features are then joined on, followed by the labels
    from ``get_labels`` for the five days starting at the window end (exact
    boundary handling is up to ``get_actions``).

    Args:
        user: Basic user features, joined on ``user_id``.
        product: Basic product features, joined on ``sku_id`` and ``cate``.
        all_actions: Full action log passed to every feature helper.
        train_start_date: Window start as a ``'YYYY-MM-DD'`` string.
        neg_ratio: Integer number of negative rows kept per positive row.
        random_state: Optional seed forwarded to ``DataFrame.sample``; the
            default ``None`` keeps the sampling non-deterministic.

    Returns:
        DataFrame holding every positive row followed by the sampled negative
        rows, with a fresh 0..n-1 index and NaNs replaced by 0.
    """
    date_fmt = '%Y-%m-%d'
    window_end = datetime.strptime(train_start_date, date_fmt) + timedelta(days=3)
    train_end_date = window_end.strftime(date_fmt)
    # Corrected time span for prod_acc / cate_acc: 30 days back from the end.
    start_days = (window_end - timedelta(days=30)).strftime(date_fmt)
    print(train_end_date)

    user_acc = get_recent_user_feat(train_end_date, all_actions)
    print('get_recent_user_feat finsihed')

    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)
    print('get_user_cate_feature finished')

    product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)
    print('get_accumulate_product_feat finsihed')

    cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)
    print('get_accumulate_cate_feat finsihed')

    comment_acc = get_comments_product_feat(train_end_date)
    print('get_comments_product_feat finished')

    # Labels come from the five days that start where the feature window ends.
    test_start_date = train_end_date
    test_end_date = (window_end + timedelta(days=5)).strftime(date_fmt)
    labels = get_labels(test_start_date, test_end_date, all_actions)
    print("get labels")

    actions = None
    for i in (3, 5, 7, 10, 15, 21, 30):
        span_start = (window_end - timedelta(days=i)).strftime(date_fmt)
        span_feat = get_action_feat(span_start, train_end_date, all_actions, i)
        if actions is None:
            actions = span_feat
        else:
            # Note the join keys: the longer look-backs are attached to the
            # rows of the 3-day frame.
            actions = pd.merge(actions, span_feat, how='left',
                               on=['user_id', 'sku_id', 'cate'])

    actions = pd.merge(actions, user, how='left', on='user_id')
    actions = pd.merge(actions, user_acc, how='left', on='user_id')
    # Presumably user_cate carries 'user_id' as its index name as well as a
    # column, which makes the merge key ambiguous - TODO confirm against
    # get_user_cate_feature.
    user_cate.index.name = ""
    actions = pd.merge(actions, user_cate, how='left', on='user_id')
    # Note the join keys: product features are keyed by item AND category.
    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])
    actions = pd.merge(actions, product_acc, how='left', on='sku_id')
    actions = pd.merge(actions, cate_acc, how='left', on='cate')
    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')
    actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])
    # Mainly fills the gaps left by joining product features, comment
    # features and labels; rows with no label become label 0.
    actions = actions.fillna(0)

    # Down-sample the negatives.
    positives = actions[actions['label'] == 1]
    negatives = actions[actions['label'] == 0]
    del actions  # release the full frame before sampling/concatenating

    # sample() without replacement raises ValueError when asked for more rows
    # than exist, so cap the request at the number of available negatives.
    n_negatives = min(len(positives) * neg_ratio, len(negatives))
    negatives = negatives.sample(n=n_negatives, random_state=random_state)

    return pd.concat([positives, negatives], ignore_index=True)
def make_train_set(train_start_date, setNums, f_path, all_actions):
    """Build ``setNums`` sliding training windows and write them to one CSV.

    The first window starts at ``train_start_date`` and each following window
    starts one day later.  Every window is produced by ``make_actions`` and
    the results are stacked in order.

    Args:
        train_start_date: Start of the first window, ``'YYYY-MM-DD'``.
        setNums: Number of windows to build; must be at least 1.
        f_path: Destination CSV path (written without the index).
        all_actions: Full action log, passed through to ``make_actions``.

    Raises:
        ValueError: If ``setNums`` is smaller than 1 (previously this fell
            through to an AttributeError on ``None.to_csv``).
    """
    if setNums < 1:
        raise ValueError("setNums must be at least 1, got {0}".format(setNums))

    user = get_basic_user_feat()
    print('get_basic_user_feat finsihed')
    product = get_basic_product_feat()
    print('get_basic_product_feat finsihed')

    # Sliding window: collect the per-window frames and concatenate once at
    # the end instead of re-copying the growing frame on every round.
    window_frames = []
    date_fmt = '%Y-%m-%d'
    for i in range(setNums):
        print(train_start_date)
        window_frames.append(
            make_actions(user, product, all_actions, train_start_date))
        # Move the window forward by one day for the next round.
        next_start = datetime.strptime(train_start_date, date_fmt) + timedelta(days=1)
        train_start_date = next_start.strftime(date_fmt)
        print("round {0}/{1} over!".format(i + 1, setNums))

    train_actions = pd.concat(window_frames, ignore_index=True)
    train_actions.to_csv(f_path, index=False)
# Scratch cell: recompute the window boundaries for the 2016-02-01 window so
# the feature helpers below can be tried individually.
train_start_date = '2016-02-01'
train_end_date = (
    datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)
).strftime('%Y-%m-%d')
# Corrected time span for prod_acc / cate_acc: 30 days back from the window end.
start_days = (
    datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)
).strftime('%Y-%m-%d')
print(train_end_date)
2016-02-04
# Scratch cell: compute the user-category features for the current window
# on their own.  get_user_cate_feature() is defined outside this excerpt.
user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)
print ('get_user_cate_feature finished')
get_user_cate_feature finished
# Scratch cell: compute the accumulated product features over the 30-day
# look-back ending at train_end_date.  The helper is defined outside this
# excerpt.
product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)
print ('get_accumulate_product_feat finsihed')
get_accumulate_product_feat finsihed
# Scratch cell: compute the accumulated category features over the 30-day
# look-back ending at train_end_date.  The helper is defined outside this
# excerpt.
cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)
print ('get_accumulate_cate_feat finsihed')
get_accumulate_cate_feat finsihed
# Training set: 20 sliding windows starting at 2016-02-01, written to
# train_set.csv.
train_start_date = '2016-02-01'
make_train_set(
    train_start_date=train_start_date,
    setNums=20,
    f_path='train_set.csv',
    all_actions=all_actions,
)
get_basic_user_feat finsihed get_basic_product_feat finsihed 2016-02-01 2016-02-04
4.2 构造验证集(线下测试集)
def make_val_set(train_start_date, train_end_date, val_s1_path="test_set.csv"):
    """Build the unlabelled offline-validation feature table and save it as CSV.

    The features match the ones assembled for training: action features for
    look-backs of 3, 5, 7, 10, 15, 21 and 30 days ending at
    ``train_end_date`` (left-joined onto the 3-day frame), plus user,
    user-category, product, category and comment features.  No labels are
    attached and no sampling is done.

    This cell used to define a two-argument ``make_test_set`` with a
    hard-coded output file, while the notebook calls
    ``make_val_set(start, end, path)`` right after it.  The function now
    carries that name and signature; the old name stays available as an alias.

    Args:
        train_start_date: Window start (``'YYYY-MM-DD'``); only used for the
            user-category features.
        train_end_date: Window end (``'YYYY-MM-DD'``); every look-back is
            measured from this date.
        val_s1_path: Destination CSV path.  Defaults to the previously
            hard-coded ``"test_set.csv"``.
    """
    date_fmt = '%Y-%m-%d'
    window_end = datetime.strptime(train_end_date, date_fmt)
    # Product / category accumulations look back 30 days from the window end.
    start_days = (window_end - timedelta(days=30)).strftime(date_fmt)

    all_actions = get_all_action()
    print("get all actions!")
    user = get_basic_user_feat()
    print('get_basic_user_feat finsihed')
    product = get_basic_product_feat()
    print('get_basic_product_feat finsihed')

    user_acc = get_recent_user_feat(train_end_date, all_actions)
    # The message previously named a different helper
    # (get_accumulate_user_feat) than the one actually called.
    print('get_recent_user_feat finsihed')

    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)
    print('get_user_cate_feature finished')

    product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)
    print('get_accumulate_product_feat finsihed')
    cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)
    print('get_accumulate_cate_feat finsihed')
    comment_acc = get_comments_product_feat(train_end_date)

    actions = None
    for i in (3, 5, 7, 10, 15, 21, 30):
        span_start = (window_end - timedelta(days=i)).strftime(date_fmt)
        span_feat = get_action_feat(span_start, train_end_date, all_actions, i)
        if actions is None:
            actions = span_feat
        else:
            actions = pd.merge(actions, span_feat, how='left',
                               on=['user_id', 'sku_id', 'cate'])

    actions = pd.merge(actions, user, how='left', on='user_id')
    actions = pd.merge(actions, user_acc, how='left', on='user_id')
    # Presumably user_cate carries 'user_id' as its index name as well as a
    # column, which makes the merge key ambiguous - TODO confirm against
    # get_user_cate_feature.
    user_cate.index.name = ""
    actions = pd.merge(actions, user_cate, how='left', on='user_id')
    # Note the join keys: product features are keyed by item AND category.
    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])
    actions = pd.merge(actions, product_acc, how='left', on='sku_id')
    actions = pd.merge(actions, cate_acc, how='left', on='cate')
    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')
    actions = actions.fillna(0)

    actions.to_csv(val_s1_path, index=False)


# Backward-compatible alias for the name this cell used to define.
make_test_set = make_val_set
# Build the offline validation set for the window 2016-02-23 .. 2016-02-26;
# the third argument is presumably the output CSV path.
# NOTE(review): make_val_set must be defined before this call runs - verify
# that a function with this name and a three-argument signature exists.
make_val_set('2016-02-23', '2016-02-26', 'val_3.csv')
4.3 构造测试集
def make_test_set(train_start_date, train_end_date, f_path="test_set.csv"):
    """Build the unlabelled test feature table and save it as CSV.

    The features match the ones assembled for training: action features for
    look-backs of 3, 5, 7, 10, 15, 21 and 30 days ending at
    ``train_end_date`` (left-joined onto the 3-day frame), plus user,
    user-category, product, category and comment features.  No labels are
    attached and no sampling is done.

    Args:
        train_start_date: Window start (``'YYYY-MM-DD'``); only used for the
            user-category features.
        train_end_date: Window end (``'YYYY-MM-DD'``); every look-back is
            measured from this date.
        f_path: Destination CSV path.  Defaults to the previously hard-coded
            ``"test_set.csv"``.
    """
    date_fmt = '%Y-%m-%d'
    window_end = datetime.strptime(train_end_date, date_fmt)
    # Product / category accumulations look back 30 days from the window end.
    start_days = (window_end - timedelta(days=30)).strftime(date_fmt)

    all_actions = get_all_action()
    print("get all actions!")
    user = get_basic_user_feat()
    print('get_basic_user_feat finsihed')
    product = get_basic_product_feat()
    print('get_basic_product_feat finsihed')

    user_acc = get_recent_user_feat(train_end_date, all_actions)
    # The message previously named a different helper
    # (get_accumulate_user_feat) than the one actually called.
    print('get_recent_user_feat finsihed')

    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)
    print('get_user_cate_feature finished')

    product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)
    print('get_accumulate_product_feat finsihed')
    cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)
    print('get_accumulate_cate_feat finsihed')
    comment_acc = get_comments_product_feat(train_end_date)

    actions = None
    for i in (3, 5, 7, 10, 15, 21, 30):
        span_start = (window_end - timedelta(days=i)).strftime(date_fmt)
        span_feat = get_action_feat(span_start, train_end_date, all_actions, i)
        if actions is None:
            actions = span_feat
        else:
            actions = pd.merge(actions, span_feat, how='left',
                               on=['user_id', 'sku_id', 'cate'])

    actions = pd.merge(actions, user, how='left', on='user_id')
    actions = pd.merge(actions, user_acc, how='left', on='user_id')
    # The training and validation builders in this notebook both clear the
    # index name before this merge and this function did not.  Presumably
    # user_cate carries 'user_id' as its index name as well as a column,
    # which makes the merge key ambiguous - TODO confirm against
    # get_user_cate_feature.
    user_cate.index.name = ""
    actions = pd.merge(actions, user_cate, how='left', on='user_id')
    # Note the join keys: product features are keyed by item AND category.
    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])
    actions = pd.merge(actions, product_acc, how='left', on='sku_id')
    actions = pd.merge(actions, cate_acc, how='left', on='cate')
    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')
    actions = actions.fillna(0)

    actions.to_csv(f_path, index=False)