下文仅给出模型的PyTorch实现,具体问题分析过程以及数据下载可以去作者的GitHub
逻辑回归介绍:可以看这里 主要是区分它和线性回归的差异
导入相关库:
import numpy import pandas as pd import matplotlib.pyplot as plt import torch import torch.nn as nn
载入数据:
df = pd.read_csv('./data/GiveMeSomeCredit/cs-training.csv') sampleEntry = pd.read_csv('./data/GiveMeSomeCredit/sampleEntry.csv')
训练数据是一个包含用户各种信息的Excel文件,这里仅列出几行
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
1 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
2 3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
3 4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
4 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
数据初步处理:
# 将缺失值补全
df.rename(columns={'Unnamed: 0':'Id', 'SeriousDlqin2yrs':'Default'}, inplace=True) df['MonthlyIncome'].fillna(df['MonthlyIncome'].mean(),inplace=True) df['NumberOfDependents'].fillna(test['NumberOfDependents'].mode()[0], inplace=True)
数据的归一化,并将读取的所有数据按照7:3分为训练数据和测试数据:
# 获取训练集和测试集 features=['RevolvingUtilizationOfUnsecuredLines','age','NumberOfTime30-59DaysPastDueNotWorse','DebtRatio','MonthlyIncome','DebtRatio','MonthlyIncome','NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents'] dep=['Default'] allfeatures = ['RevolvingUtilizationOfUnsecuredLines','age','NumberOfTime30-59DaysPastDueNotWorse','DebtRatio','MonthlyIncome','DebtRatio','MonthlyIncome','NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents','Default'] alldata = df[allfeatures] # 数据归一化 means, stds = dict(), dict() for col in features: means[col] = alldata[col].mean() stds[col] = alldata[col].std() for col in features: alldata[col] = (alldata[col] - means[col]) / stds[col] x=alldata[features] y=alldata[dep] train_size = int(x.shape[0] * 0.7) test_size = x.shape[0] - train_size x_train = x[:train_size] y_train = y[:train_size] x_test = x[train_size - 1:-1] y_test = y[train_size - 1:-1] # 将数据转化为Tensor x_train = torch.tensor(x_train.values, dtype=torch.float) y_train = torch.tensor(y_train.values, dtype=torch.float) x_test = torch.tensor(x_test.values, dtype=torch.float) y_test = torch.tensor(y_test.values, dtype=torch.float)
具体模型,这里的输入维度14和输出维度2可以从上面提供的x,y数据看出:
class MulNet(nn.Module): def __init__(self):# 输入维度14 输出维度2 super(MulNet, self).__init__() self.l1 = nn.Linear(14, 140) self.l2 = nn.Linear(140, 140) self.l3 = nn.Linear(140, 2) self.sigmod = nn.Sigmoid() def forward(self, x): y = self.l1(x) y = self.l2(y) y = self.l3(y) return self.sigmod(y)
超参数设定:
model3 = MulNet() criterion = nn.MSELoss() optimizer = torch.optim.SGD(model3.parameters(), lr=0.01)
训练模型:
for t in range(500): y_pred = model3(x_train) loss = criterion(y_pred, y_train) if (t + 1) % 10 == 0: print(t, loss.item()) losses1.append(loss.item()) if torch.isnan(loss): break optimizer.zero_grad() loss.backward() optimizer.step()
9 0.26426613330841064 19 0.24677835404872894 29 0.23098362982273102 39 0.21666239202022552 49 0.20363378524780273 59 0.19174979627132416 69 0.18088917434215546 79 0.17095163464546204 89 0.16185317933559418 99 0.15352196991443634 109 0.14589543640613556 119 0.1389177143573761 129 0.13253812491893768 139 0.12670989334583282 149 0.12138950824737549 159 0.11653617769479752 169 0.11211156845092773 179 0.1080796867609024 189 0.10440683364868164 199 0.10106151551008224 209 0.0980144515633583 219 0.09523852169513702 229 0.0927087813615799 239 0.09040225297212601 249 0.08829796314239502 259 0.08637678623199463 269 0.08462131768465042 279 0.0830157995223999 289 0.08154597878456116 299 0.08019895851612091 309 0.07896309345960617 319 0.07782793790102005 329 0.07678405195474625 339 0.07582291960716248 349 0.07493693381547928 359 0.0741191878914833 369 0.07336351275444031 379 0.07266434282064438 389 0.072016641497612 399 0.07141589373350143 409 0.07085802406072617 419 0.07033936679363251 429 0.06985656172037125 439 0.06940664350986481 449 0.06898687034845352 459 0.0685947984457016 469 0.06822817772626877 479 0.06788500398397446 489 0.06756342947483063 499 0.06726177781820297
训练结果:
preds = model3(x_test) print(preds.shape) _, ys = torch.max(preds, 1) print(ys.shape) right = 0.0 for i in range(len(ys)): if ys[i] == y_test[i]: right += 1 acc = right / len(ys) print(acc) # 之前试过学习率设为0.1时 正确率只有0.45
torch.Size([45000, 2]) torch.Size([45000]) 0.8359111111111112