def calc_score_median(sample_set, var):
'''
计算相邻评分的中位数,以便进行决策树二元切分
param sample_set: 待切分样本
param var: 分割变量名称
'''
var_list = list(np.unique(sample_set[var]))
var_median_list = []
for i in range(len(var_list)-1):
var_median = (var_list[i]+var_list[i+1])/2
var_median_list.append(var_median)
return var_median_list
def choose_best_split(sample_set, var, min_sample):
'''
使用CART分类决策树选择最好的样本切分点
返回切分点
param sample_set: 待切分样本
param var: 分割变量名称
param min_sample: 待切分样本的最小样本量(限制条件)
'''
#根据样本评分计算相邻不同分数的中间值
score_median_list = calc_score_median(sample_set, var)
median_len = len(score_median_list)
sample_cnt = sample_set.shape[0]
sample1_cnt = sum(sample_set["target"])
sample0_cnt = sample_cnt - sample1_cnt
Gini = 1-np.square(sample1_cnt/sample_cnt)-np.square(sample0_cnt/sample_cnt)
bestGini = 0.0; bestSplit_point = 0.0; bestSplit_position = 0.0
for i in range(median_len):
left = sample_set[sample_set[var]<score_median_list[i]]
right = sample_set[sample_set[var]>score_median_list[i]]
left_cnt = left.shape[0]; right_cnt = right.shape[0]
left1_cnt = sum(left["target"]); right1_cnt = sum(right["target"])
left0_cnt = left_cnt - left1_cnt; right0_cnt = right_cnt - right1_cnt
left_ratio = left_cnt/sample_cnt; right_ratio = right_cnt/sample_cnt
if left_cnt<min_sample or right_cnt<min_sample:
continue
Gini_left = 1-np.square(left1_cnt/left_cnt) - np.square(left0_cnt/left_cnt)
Gini_right = 1- np.square(right1_cnt/right_cnt) - np.square(right0_cnt/left_cnt)
Gini_temp = Gini - (left_ratio*Gini_left + right_ratio*Gini_right)
if Gini_temp > bestGini:
bestGini = Gini_temp; bestSplit_point = score_median_list[i]
if median_len>1:
bestSplit_position = i/(median_len-1)
else:
bestSplit_position = i/median_len
else:
continue
Gini = Gini-bestGini
return bestSplit_point, bestSplit_position
def bining_data_split(sample_set, var, min_sample, split_list):
"""
划分数据找到最优分割点list
param sample_set: 待切分样本
param var: 分割变量名称
param min_sample: 待切分样本的最小样本量(限制条件)
param split_list: 最优分割点list
"""
split, position = choose_best_split(sample_set, var, min_sample)
if split !=0.0:
split_list.append(split)
# 根据分割点划分数据集,继续进行划分
sample_set_left = sample_set[sample_set[var] < split]
sample_set_right = sample_set[sample_set[var] >split]
# 如果左子树样本量超过2倍最小样本量,且分割点不是第一个分割点,则切分左子树
if len(sample_set_left) >= min_sample*2 and position not in [0.0,1.0]:
bining_data_split(sample_set_left, var, min_sample, split_list)
else:
None
# 如果右子树样本量超过2倍最小样本量,且分割点不是最后一个分割点,则切分右子树
if len(sample_set_right) >= min_sample*2 and position not in [0.0,1.0]:
bining_data_split(sample_set_right, var, min_sample, split_list)
else:
None
def get_bestsplit_list(sample_set, var):
"""
根据分箱得到最优分割点list
param sample_set: 待切分样本
param var: 分割变量名称
"""
# 计算最小样本阈值(终止条件)
min_df = sample_set.shape[0]*0.05
split_list = []
# 计算第一个和最后一个分割点
bining_data_split(sample_set, var, min_df, split_list)
return split_list
def bining_plot(sample,vars_name):
try:
# sample_df,name
# sample =sample_df
data = sample.copy()
split_list = get_bestsplit_list(data, vars_name)
split_l = [-np.inf]
split_l.extend(split_list)
split_l.append(np.inf)
data[f"{vars_name}_bin"] = pd.cut(sample[vars_name],bins=split_l)
data[f"{vars_name}_bin"] = data[f"{vars_name}_bin"].cat.add_categories(['null'])
data[f"{vars_name}_bin"].fillna("null",inplace=True)
data_count = pd.DataFrame(data.groupby(by = ['{}_bin'.format(vars_name)])['real_order_id'].count())
data_count.columns = ['订单数']
#放款数
data_loan = pd.DataFrame(data.groupby(by = ['{}_bin'.format(vars_name)])['loan_flag'].sum())
data_loan.columns = ['放款数']
#逾期数
data_overdue = pd.DataFrame(data.groupby(by = ['{}_bin'.format(vars_name)]).apply(lambda x : (x['status']==3).sum()))
data_overdue.columns = ['逾期数']
data_stat = pd.concat([data_count,data_loan,data_overdue],axis=1)
data_stat.reset_index(inplace=True)
data_stat.columns = ['bins','订单数','放款数','逾期数']
data_stat = pd.concat([data_stat[data_stat.bins=='null'],data_stat[data_stat.bins!='null']],axis=0)
#print(data_stat)
data_stat.reset_index(inplace=True,drop=True)
data_stat['订单占比'] = data_stat['订单数']/data_stat['订单数'].sum()
data_stat['放款率'] = data_stat['放款数']/data_stat['订单数']
data_stat['逾期率'] = data_stat['逾期数']/data_stat['放款数']
data_stat['逾期率'].fillna(0,inplace=True)
plt.figure(figsize=(15,8))
plt.plot([ i for i in range(data_stat.shape[0])],data_stat['逾期率'])
plt.bar([ i for i in range(data_stat.shape[0])],data_stat['订单占比'],color = 'orange')
plt.xticks([ i for i in range(data_stat.shape[0]+1)],data_stat['bins'],rotation=50)
data_stat['订单占比'] = data_stat['订单占比'].map(lambda x :str(np.round(x*100,2))+'%')
data_stat['放款率'] = data_stat['放款率'].map(lambda x :str(np.round(x*100,2))+'%')
data_stat['逾期率'] = data_stat['逾期率'].map(lambda x :str(np.round(x*100,2))+'%')
print(tabulate(data_stat,headers=data_stat.columns,tablefmt='grid'))
plt.show()
except:
error_list.append(vars_name)