# Basic concepts: https://zhuanlan.zhihu.com/p/344754828
import sys
import pandas as pd
import numpy as np
import math
# all_list = []
# df = pd.DataFrame(columns = ['date', 'data'])
# counter = 0
# for line in sys.stdin:
# line = line.strip('\r\n')
# df.loc[counter] = line.split(',')
# counter += 1
# df.to_excel('./output2.xlsx', sheet_name='Sheet1')
def calc_psi(dataframe, n_bins=10):
    """Compute the Population Stability Index between consecutive months.

    The ``data`` column is cut into ``n_bins`` quantile bins over the whole
    frame. For every month the share of that month's rows falling into each
    bin is computed, and each month is compared with the month before it:

        index = (actual share - expected share) * ln(actual share / expected share)

    The PSI of a month pair is the sum of ``index`` over all bins.

    Months are compared in the order in which they first appear in
    ``dataframe``, which is not necessarily chronological order.

    Args:
        dataframe: Frame with a ``date`` column whose values read as
            ``%Y%m%d`` after ``str()`` and a numeric ``data`` column.
            It is modified in place: a ``group`` column (quantile bin) and a
            ``new_date`` column (month key) are added.
        n_bins: Number of quantile bins passed to ``pd.qcut``.

    Returns:
        A list with one PSI value per pair of consecutive months (empty when
        there is only one month). A value is ``inf`` when a bin is populated
        in only one of the two months being compared.

    Raises:
        ValueError: Propagated from ``pd.qcut`` (e.g. non-unique bin edges)
            or from ``datetime.strptime`` for a malformed date.
    """
    from datetime import datetime

    def month_key(raw_date):
        # Parse once; the key is the year followed by the unpadded month,
        # e.g. 20210812 -> '20218'.
        parsed = datetime.strptime(str(raw_date), '%Y%m%d').date()
        return str(parsed.year) + str(parsed.month)

    # Bin on the pooled data so every month is measured on the same intervals.
    result = pd.qcut(dataframe['data'], n_bins)
    dataframe["group"] = result
    group = result.drop_duplicates(inplace=False).sort_values(ascending=True)
    group.index = range(1, len(group) + 1)
    print(group)
    print('group 1:', group[1])

    # Month key per row, plus the distinct months in order of first appearance.
    months = dataframe['date'].apply(month_key)
    months_drop = months.drop_duplicates(inplace=False)
    print(months_drop.values)
    dataframe['new_date'] = months
    print(dataframe)

    # Only bins that actually occur; a missing-value "bin" is not a real group.
    bins = list(group.dropna())

    actual_i = []
    psi_indexs = []
    for month in months_drop.values:
        in_month = months == month
        month_total = int(in_month.sum())
        # Share of this month's rows that fall into each bin.
        group_i = [
            int((in_month & (dataframe['group'] == interval)).sum()) / month_total
            for interval in bins
        ]
        print('group_i:', group_i)
        actual_i.append(group_i)

        if len(actual_i) > 1:
            actual_ = np.array(actual_i[-1])
            expect_ = np.array(actual_i[-2])
            # Empty bins produce log(0) or 0/0; silence the warnings and
            # resolve those terms explicitly below.
            with np.errstate(divide='ignore', invalid='ignore'):
                terms = (actual_ - expect_) * np.log(actual_ / expect_)
            # Equal shares contribute exactly 0; this also covers the 0/0
            # case, which would otherwise turn the whole sum into NaN.
            psi_index = np.where(actual_ == expect_, 0.0, terms)
            psi_indexs.append(psi_index)
            print('psi_index:', psi_index)

    print(actual_i)
    final_index = [np.sum(psi_index) for psi_index in psi_indexs]
    print('final_index: ', final_index)
    return final_index
# Load the comma-separated input; column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
data = pd.read_csv('./lyx_data.txt', names=['date', 'data'], sep=',')
# Run the PSI report on the loaded frame (calc_psi adds columns to `data`).
calc_psi(data)
# Generate the sample data file (kept commented out for reference)
# import numpy as np
# date = ['20210812', '20210922', '20211009', '20211102', '20211202', '20220112', '20220202']
# new_date = []
# for i in range(100):
# new_date.extend(date)
# new_data = []
# for i in range(700):
# new_data.append(np.random.random(1))
# data = pd.DataFrame(columns=['date', 'data'])
# data['date'] = np.array(new_date)
# data['data'] = np.array(new_data)
# data.to_csv('./lyx_data.txt', sep=',', index=None)