• PSI(Population Stability Index,群体稳定性指标)计算


    基础概念:https://zhuanlan.zhihu.com/p/344754828

    import sys
    import pandas as pd
    import numpy as np
    import math
    
    
    # all_list = []
    # df = pd.DataFrame(columns = ['date', 'data'])
    # counter = 0
    # for line in sys.stdin:
    #     line = line.strip('\r\n')
    #     df.loc[counter] = line.split(',')
    #     counter += 1
    # df.to_excel('./output2.xlsx', sheet_name='Sheet1')
    
    def calc_psi(dataframe, *, n_bins=10, epsilon=1e-6, verbose=True):
        """Compute the month-over-month Population Stability Index (PSI).

        The ``data`` column is split into quantile bins computed over the whole
        frame.  For every calendar month found in the ``date`` column, the
        share of that month's rows falling into each bin is computed, and each
        month is then compared with the month before it::

            PSI = sum((actual - expected) * ln(actual / expected))

        where ``expected`` is the previous month's per-bin share and ``actual``
        is the current month's per-bin share.

        Side effect: two columns are added to ``dataframe`` in place --
        ``group`` (the quantile bin of each row) and ``new_date`` (the year
        followed by the month number without zero padding, e.g. ``'20219'``).

        Args:
            dataframe: Frame with a ``date`` column whose values render as
                ``YYYYMMDD`` under ``str()`` and a numeric ``data`` column.
            n_bins: Number of quantile bins requested from ``pd.qcut``.
                Duplicate bin edges are dropped, so fewer bins may result.
            epsilon: Value substituted for a share that is exactly zero, so
                the logarithm and the ratio stay finite.  Non-zero shares are
                never altered.
            verbose: When true, print the bins, the months, the annotated
                frame and the intermediate and final results.

        Returns:
            A list of floats, one PSI value per pair of consecutive months in
            chronological order.  Empty when the frame is empty or holds
            fewer than two distinct months.

        Raises:
            ValueError: If a ``date`` value does not match ``%Y%m%d``.
        """
        from datetime import datetime

        if dataframe.empty:
            return []

        # Quantile binning over the full data set; every month is measured
        # against the same bin edges.
        bins = pd.qcut(dataframe['data'], n_bins, duplicates='drop')
        dataframe['group'] = bins
        categories = bins.cat.categories
        # Code -1 marks rows whose value is missing and therefore has no bin.
        bin_codes = bins.cat.codes.to_numpy().astype(np.intp)
        if verbose:
            print(categories)

        # Parse each date exactly once.
        parsed = [datetime.strptime(str(value), '%Y%m%d')
                  for value in dataframe['date']]
        dataframe['new_date'] = [str(day.year) + str(day.month) for day in parsed]

        # Order months by (year, month): the unpadded labels do not sort
        # chronologically as strings, and the rows may arrive in any order.
        month_keys = sorted({(day.year, day.month) for day in parsed})
        position_of = {key: position for position, key in enumerate(month_keys)}
        row_month = np.array([position_of[(day.year, day.month)] for day in parsed])
        if verbose:
            print(np.array([str(year) + str(month) for year, month in month_keys],
                           dtype=object))
            print(dataframe)

        # Per-month share of rows in each bin.  The denominator is every row of
        # the month, including rows without a bin.
        distributions = []
        for position in range(len(month_keys)):
            in_month = row_month == position
            counts = np.bincount(bin_codes[in_month & (bin_codes >= 0)],
                                 minlength=len(categories))
            shares = counts / in_month.sum()
            distributions.append(shares)
            if verbose:
                print('group_i:', shares.tolist())

        # index = (actual share - expected share) * ln(actual share / expected share)
        final_index = []
        for previous, current in zip(distributions, distributions[1:]):
            expected = np.where(previous == 0, epsilon, previous)
            actual = np.where(current == 0, epsilon, current)
            psi_index = (actual - expected) * np.log(actual / expected)
            if verbose:
                print('psi_index:', psi_index)
            final_index.append(float(np.sum(psi_index)))

        if verbose:
            print('final_index: ', final_index)
        return final_index
    
    
    # date_drop = data.drop_duplicates(subset=['date', 'date'], keep='first', inplace=False)
    # Load the two-column "date,data" file (column names are supplied here,
    # not read from the file) and report its month-over-month PSI.
    data = pd.read_csv(
        './lyx_data.txt',
        names=['date', 'data'],
        sep=',',
    )
    calc_psi(data)
    
    
    # 生成数据
    # import numpy as np
    # date = ['20210812', '20210922', '20211009', '20211102', '20211202', '20220112', '20220202']
    # new_date = []
    # for i in range(100):
    #     new_date.extend(date)
    # new_data = []
    # for i in range(700):
    #     new_data.append(np.random.random(1))
    
    # data = pd.DataFrame(columns=['date', 'data'])
    # data['date'] = np.array(new_date)
    # data['data'] = np.array(new_data)
    # data.to_csv('./lyx_data.txt', sep=',', index=None)
    
    
  • 相关阅读:
    《代码大全2》阅读笔记02
    《代码大全2》阅读笔记01
    第二阶段冲刺第六天
    学习进度5
    构建之法阅读笔记03
    地铁进度记录
    学习进度4
    个人数组
    学习进度3
    构建之法阅读笔记02
  • 原文地址:https://www.cnblogs.com/douzujun/p/15928671.html
Copyright © 2020-2023  润新知