• Gini binning
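
The functions below grow a tiny CART-style tree on a single variable to find bin edges: for a node whose bad-sample share is p1 and good-sample share is p0, the Gini impurity is Gini = 1 - p1^2 - p0^2, and each candidate cut is scored by the impurity reduction Gini(parent) - (n_left/n)*Gini(left) - (n_right/n)*Gini(right). The cut with the largest reduction becomes a bin edge, and the procedure recurses into both children until a child would fall below the minimum sample size (5% of all rows here).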


    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from tabulate import tabulate

    # variables whose binning or plotting fails are collected here (used in bining_plot)
    error_list = []


    def calc_score_median(sample_set, var):
        '''
        Compute the midpoints between adjacent distinct scores so the decision tree can make binary cuts
        param sample_set: sample to be split
        param var: name of the splitting variable
        '''
        var_list = list(np.unique(sample_set[var]))
        var_median_list = []
        for i in range(len(var_list)-1):
            var_median = (var_list[i]+var_list[i+1])/2
            var_median_list.append(var_median)
        return var_median_list
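
A quick sanity check of the midpoint helper. This is only an illustrative sketch: the column names "x" and "target" are toy conventions, and it assumes the imports and functions above have been run.

    # unique values 1, 2, 4, 8 -> candidate cut points halfway between neighbours
    toy = pd.DataFrame({"x": [1, 2, 4, 8], "target": [0, 0, 1, 1]})
    print(calc_score_median(toy, "x"))   # the midpoints 1.5, 3.0 and 6.0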
    
    def choose_best_split(sample_set, var, min_sample):
        '''
        Use the CART classification criterion (Gini impurity) to choose the best split point
        Returns the split point and its relative position among the candidate midpoints
        param sample_set: sample to be split
        param var: name of the splitting variable
        param min_sample: minimum sample size allowed in a child node (stopping constraint)
        '''
        # midpoints between adjacent distinct scores are the candidate split points
        score_median_list = calc_score_median(sample_set, var)
        median_len = len(score_median_list)
        sample_cnt = sample_set.shape[0]
        sample1_cnt = sum(sample_set["target"])
        sample0_cnt = sample_cnt - sample1_cnt
        Gini = 1-np.square(sample1_cnt/sample_cnt)-np.square(sample0_cnt/sample_cnt)
        bestGini = 0.0; bestSplit_point = 0.0; bestSplit_position = 0.0
        for i in range(median_len):
            left = sample_set[sample_set[var]<score_median_list[i]]
            right = sample_set[sample_set[var]>score_median_list[i]]
            left_cnt = left.shape[0]; right_cnt = right.shape[0]
    
            left1_cnt = sum(left["target"]); right1_cnt = sum(right["target"])
            left0_cnt = left_cnt - left1_cnt; right0_cnt = right_cnt - right1_cnt
            left_ratio = left_cnt/sample_cnt; right_ratio = right_cnt/sample_cnt
            if left_cnt<min_sample or right_cnt<min_sample:
                continue
            
            Gini_left = 1-np.square(left1_cnt/left_cnt) - np.square(left0_cnt/left_cnt)
            Gini_right = 1 - np.square(right1_cnt/right_cnt) - np.square(right0_cnt/right_cnt)
            Gini_temp = Gini - (left_ratio*Gini_left + right_ratio*Gini_right)
            if Gini_temp > bestGini:
                bestGini = Gini_temp; bestSplit_point = score_median_list[i]
                # relative position of the chosen midpoint within the candidate list
                if median_len > 1:
                    bestSplit_position = i/(median_len-1)
                else:
                    bestSplit_position = i/median_len
        return bestSplit_point, bestSplit_position
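
To see how the Gini gain drives the choice, here is a minimal sketch on a toy frame where the two classes separate cleanly around 7. The column names "x" and "target" follow the conventions of the code above, and the values in the comments are what this implementation should produce for this particular toy data, not a general guarantee.

    toy = pd.DataFrame({
        "x":      [1, 2, 3, 4, 10, 11, 12, 13],
        "target": [0, 0, 0, 0,  1,  1,  1,  1],
    })
    # the cut at 7.0 yields two pure children, i.e. the largest impurity reduction;
    # 7.0 is the 4th of the 7 candidate midpoints, so its relative position is 0.5
    point, position = choose_best_split(toy, "x", min_sample=1)
    print(point, position)   # expected output: 7.0 0.5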
            
    
    def bining_data_split(sample_set, var, min_sample, split_list):
        """
        划分数据找到最优分割点list
        param sample_set: 待切分样本
        param var: 分割变量名称
        param min_sample: 待切分样本的最小样本量(限制条件)
        param split_list: 最优分割点list    
        """
        split, position = choose_best_split(sample_set, var, min_sample)
        # bestSplit_point == 0.0 is the sentinel for "no valid cut was found"
        if split != 0.0:
            split_list.append(split)
        # split the data at the chosen point and keep splitting each side recursively
        sample_set_left = sample_set[sample_set[var] < split]
        sample_set_right = sample_set[sample_set[var] > split]
        # split the left child again only if it still holds at least twice the minimum
        # sample size and the chosen point is not at either end of the candidate list
        if len(sample_set_left) >= min_sample*2 and position not in [0.0, 1.0]:
            bining_data_split(sample_set_left, var, min_sample, split_list)
        # split the right child again under the same conditions
        if len(sample_set_right) >= min_sample*2 and position not in [0.0, 1.0]:
            bining_data_split(sample_set_right, var, min_sample, split_list)
    
    
    def get_bestsplit_list(sample_set, var):
        """
        根据分箱得到最优分割点list
        param sample_set: 待切分样本
        param var: 分割变量名称    
        """
    #     计算最小样本阈值(终止条件)
        min_df = sample_set.shape[0]*0.05
        split_list = []
    #     计算第一个和最后一个分割点
        bining_data_split(sample_set, var, min_df, split_list)
        return split_list
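
On the same toy frame as above, the recursive routine should stop after a single cut, because both children of the first split are already pure:

    toy = pd.DataFrame({
        "x":      [1, 2, 3, 4, 10, 11, 12, 13],
        "target": [0, 0, 0, 0,  1,  1,  1,  1],
    })
    # min_df = 8*0.05 = 0.4, so the size constraint never bites here;
    # both children of the first cut are pure, so no further split is added
    print(get_bestsplit_list(toy, "x"))   # expected: a single cut point, 7.0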
    
    
    
    def bining_plot(sample, vars_name):
        """
        Bin one variable with the Gini splits, then print a summary table and plot
        the order share and overdue rate per bin
        param sample: DataFrame containing the variable plus 'target', 'real_order_id',
                      'loan_flag' and 'status' columns
        param vars_name: name of the variable to bin
        """
        try:
            data = sample.copy()
            split_list = get_bestsplit_list(data, vars_name)
            # outer bin edges at -inf/+inf, plus a separate 'null' bin for missing values
            split_l = [-np.inf]
            split_l.extend(split_list)
            split_l.append(np.inf)
            data[f"{vars_name}_bin"] = pd.cut(sample[vars_name], bins=split_l)
            data[f"{vars_name}_bin"] = data[f"{vars_name}_bin"].cat.add_categories(['null'])
            data[f"{vars_name}_bin"].fillna("null", inplace=True)
            
            # 订单数 = number of orders per bin
            data_count = pd.DataFrame(data.groupby(by=['{}_bin'.format(vars_name)])['real_order_id'].count())
            data_count.columns = ['订单数']
            # 放款数 = number of disbursed loans per bin
            data_loan = pd.DataFrame(data.groupby(by=['{}_bin'.format(vars_name)])['loan_flag'].sum())
            data_loan.columns = ['放款数']
            # 逾期数 = number of overdue loans per bin (status == 3)
            data_overdue = pd.DataFrame(data.groupby(by=['{}_bin'.format(vars_name)]).apply(lambda x: (x['status'] == 3).sum()))
            data_overdue.columns = ['逾期数']
            data_stat = pd.concat([data_count,data_loan,data_overdue],axis=1)
            data_stat.reset_index(inplace=True)
            data_stat.columns = ['bins','订单数','放款数','逾期数']
            # show the 'null' bin first, then the ordered numeric bins
            data_stat = pd.concat([data_stat[data_stat.bins == 'null'], data_stat[data_stat.bins != 'null']], axis=0)
            data_stat.reset_index(inplace=True, drop=True)
            # 订单占比 = order share, 放款率 = approval rate, 逾期率 = overdue rate
            data_stat['订单占比'] = data_stat['订单数']/data_stat['订单数'].sum()
            data_stat['放款率'] = data_stat['放款数']/data_stat['订单数']
            data_stat['逾期率'] = data_stat['逾期数']/data_stat['放款数']
            data_stat['逾期率'].fillna(0, inplace=True)
            # bars: order share per bin; line: overdue rate per bin
            plt.figure(figsize=(15, 8))
            plt.plot(range(data_stat.shape[0]), data_stat['逾期率'])
            plt.bar(range(data_stat.shape[0]), data_stat['订单占比'], color='orange')
            plt.xticks(range(data_stat.shape[0]), data_stat['bins'], rotation=50)
            data_stat['订单占比'] = data_stat['订单占比'].map(lambda x: str(np.round(x*100, 2))+'%')
            data_stat['放款率'] = data_stat['放款率'].map(lambda x: str(np.round(x*100, 2))+'%')
            data_stat['逾期率'] = data_stat['逾期率'].map(lambda x: str(np.round(x*100, 2))+'%')
            print(tabulate(data_stat, headers=data_stat.columns, tablefmt='grid'))
            plt.show()
        except Exception:
            # remember variables whose binning or plotting failed
            error_list.append(vars_name)
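
Finally, an end-to-end sketch on synthetic loan-like data. The column names (score, real_order_id, loan_flag, status, target) mirror what bining_plot expects, but the distributions and the status == 3 "overdue" convention are assumptions made up purely for this demo; it assumes the code above has been run.

    # build a fake scorecard dataset: lower scores are more likely to go overdue
    rng = np.random.default_rng(42)
    n = 2000
    demo = pd.DataFrame({
        "score":         rng.normal(600, 50, n).round(),
        "real_order_id": np.arange(n),
        "loan_flag":     rng.integers(0, 2, n),
    })
    p_bad = 1/(1 + np.exp((demo["score"] - 550)/25))          # synthetic bad-rate curve
    demo["status"] = np.where(rng.random(n) < p_bad, 3, 1)    # 3 = overdue, as in the code above
    demo["target"] = (demo["status"] == 3).astype(int)

    bining_plot(demo, "score")   # prints the per-bin table and shows the bar/line chart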
    
    
    
  • Original post: https://www.cnblogs.com/lky520hs/p/11214986.html