Reference: Hung-yi Lee's ML homework 2: https://colab.research.google.com/drive/1JaMKJU7hvnDoUfZjvUKzm9u-JLeX6B2C#scrollTo=RJuoQ_R2jUmX
https://github.com/PoRuiHuang/ML2020SPRING
1 """ 2 2020/4/11 3 cfancy 4 logistic-regression 5 实做一个线性二元分类器,来根据人们的个人资料,判断其年收入 6 是否高于50000$ 7 所用数据是已经做了稍微的处理 8 X_train :训练集 9 Y_trian :目标集(训练标签集) 10 X_trian :测试集 11 手写 gradient descent 12 """ 13 # 导入库 14 import sys 15 import numpy as np 16 import pandas as pd 17 import math 18 import matplotlib.pyplot as plt 19 import csv 20 21 #常用的基础函数 22 #平均值 23 def mean(x): 24 return sum(x) / len(x) 25 26 #值与平均值之差 27 def de_mean(x): 28 x_bar = mean(x) 29 return [x_i - x_bar for x_i in x] 30 31 #矩阵点乘 32 def dot(v, w): 33 return sum(v_i * w_i for v_i, w_i in zip(v, w)) 34 #python 中 zip()函数:打包为元组的列表 35 36 # 37 def sum_of_squares(v): 38 """ 39 :param v: 40 :return: 41 """ 42 return dot(v, v) 43 44 # 方差 45 def variance(x): 46 """ 47 方差 48 :param x: 49 :return: 50 """ 51 n = len(x) 52 deviations = de_mean(x) 53 return sum_of_squares(deviations) / (n - 1) 54 #标准差 55 def standard_deviation(x): 56 """ 57 标准差 58 :param x: 59 :return: 60 """ 61 return math.sqrt(variance(x)) 62 63 #协方差 64 def covariance(x,y): 65 """ 66 协方差 67 :param x: 68 :param y: 69 :return: 70 """ 71 n = len(x) 72 return dot(de_mean(x), de_mean(y)) / (n - 1) 73 74 #相关系数 75 def correlation(x, y): 76 """ 77 相关系数 78 :return: 79 """ 80 stdev_x = standard_deviation(x) 81 stdev_y = standard_deviation(y) 82 if stdev_x > 0 and stdev_y > 0: 83 return covariance(x, y) / stdev_x / stdev_y 84 else: 85 return 0 86 87 # 一些有用的函数 88 # 打乱索引 89 def _shuffle(X, Y): 90 """ 91 将两个长度相等的列表或者数组打乱其索引 92 打乱训练集(即打乱索引) 93 :param X: 94 :param Y: 95 :return: 96 """ 97 randomize = np.arange(len(X)) 98 np.random.shuffle(randomize) 99 return (X[randomize], Y[randomize]) 100 # A = np.arange(10) 101 # B = np.arange(10) 102 # 103 # A ,B = _shuffle(A,B) 104 # print(A) #[3 5 2 9 6 7 0 8 4 1] 105 # 106 # print(B) #[3 5 2 9 6 7 0 8 4 1] 107 108 # 109 def _sigmoid(z): 110 """ 111 Sigmoid 函数用来计算概率 112 :param z: 113 :return: 114 """ 115 return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, 1 - (1e-8)) #clip 限制数组中的值 116 117 # activate function 118 def _f(X, w, b): 119 """ 120 logistic_regression 函数 121 :param X: 输入数据 shape = [batch_size, data_dimension 122 :param w: 参数 权重 w 向量 shape = [data_dimension, ] 123 :param b: 参数 偏置 b 数值 124 :return: predicted probability of each row of X being positively labeled, shape = [batch_size, ] 125 """ 126 return _sigmoid(np.matmul(X, w) + b) 127 128 def _predict(X, w, b): 129 """ 130 结果通过四舍五入的 131 :param X: 132 :param w: 133 :param b: 134 :return:返回预测的值 135 """ 136 return np.round(_f(X, w,b)).astype(np.int) 137 138 def _accuracy(Y_pred, Y_label): 139 # This function calculates prediction accuracy 140 acc = 1 - np.mean(np.abs(Y_pred - Y_label)) 141 return acc 142 143 144 """--------------------准备数据--------------------""" 145 np.random.seed(0) ##随机数生成器 146 # 读取数据 X_train Y_train X_test 并对数据进行正规化(归一化)normalization 147 X_train_fpath = './data/X_train' #训练集 148 Y_train_fpath = './data/Y_train' #目标集(标签集) 149 X_test_fpath = './data/X_test' # 测试集 150 output_fpath = './output_{}.csv' # 预测保存 151 152 #解析csv文件转为numpy数组 153 drop_list = [] 154 with open(Y_train_fpath) as f: 155 # next(f) 156 # Y_train = np.array([line.strip(' ').split(',')[1] for line in f], dtype = float) 157 # print(Y_train.shape) 158 # print(type(Y_train)) 159 # print(Y_train) 160 161 data2 = pd.read_csv(f) 162 # print(data2.shape) 163 data2 = data2.iloc[:, 1:] 164 # data2.drop(columns=['migration code-change in msa','migration code-change in reg','migration code-move within reg','migration prev res in sunbelt','country of birth father','country of birth mother','country of birth self'],inplace=True) 165 
"""-------------------- Prepare data --------------------"""
np.random.seed(0)  # fix the random number generator for reproducibility
# Read X_train, Y_train and X_test, then normalize the data
X_train_fpath = './data/X_train'  # training set
Y_train_fpath = './data/Y_train'  # training labels
X_test_fpath = './data/X_test'    # test set
output_fpath = './output_{}.csv'  # where the predictions are saved

# Parse the csv files into numpy arrays
drop_list = []
with open(Y_train_fpath) as f:
    data2 = pd.read_csv(f)
    data2 = data2.iloc[:, 1:]  # drop the id column
Y_train = data2.to_numpy(dtype=float)
Y_train = Y_train.reshape((len(Y_train),))

with open(X_train_fpath) as f:
    data1 = pd.read_csv(f)
    data1 = data1.iloc[:, 1:]  # drop the id column
X_train = data1.to_numpy(dtype=float)

# Mark features whose correlation with the label is negligible for removal
for i in range(len(X_train[0])):
    corr = correlation(X_train[:, i], Y_train)
    if abs(corr) < 0.01:
        drop_list.append(i)

print(drop_list)
print(X_train.shape)
X_train = np.delete(X_train, drop_list, 1)

with open(X_test_fpath) as f:
    data3 = pd.read_csv(f)
    data3 = data3.iloc[:, 1:]  # drop the id column
X_test = data3.to_numpy(dtype=float)
X_test = np.delete(X_test, drop_list, 1)

print("------------------------")
print(X_train)
print("------------------------")
print(Y_train)
print("------------------------")
print(X_test)

# Normalization
def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):
    """
    Normalize specific columns of X.
    :param X: data
    :param train: True when processing training data, False for testing data
    :param specified_column: indexes of the columns to be normalized;
                             None means all columns are normalized
    :param X_mean: mean of the training data, used when train = False
    :param X_std: standard deviation of the training data, used when train = False
    :return: the normalized data, the mean of the training data and the
             standard deviation of the training data.
             When processing test data, the training statistics are reused.
    """
    if specified_column is None:  # "is None", not "== None": the latter breaks for arrays
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
        X_std = np.std(X[:, specified_column], 0).reshape(1, -1)

    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)

    return X, X_mean, X_std

# Split the processed data into a training set and a development set
def _train_dev_split(X, Y, dev_ratio=0.25):
    """Split the normalized data into a training set and a development set."""
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]

# Normalize training and testing data
X_train, X_mean, X_std = _normalize(X_train, train=True)
X_test, _, _ = _normalize(X_test, train=False, specified_column=None,
                          X_mean=X_mean, X_std=X_std)

# Split the data into a training set and a development set
dev_ratio = 0.1
X_train, Y_train, X_dev, Y_dev = _train_dev_split(X_train, Y_train, dev_ratio=dev_ratio)

train_size = X_train.shape[0]
dev_size = X_dev.shape[0]
test_size = X_test.shape[0]
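# Illustrative sketch (made-up arrays, not from the dataset): the test set is
# scaled with the training set's statistics rather than its own, which is why
# X_mean and X_std are passed back in above.
toy_train = np.array([[1.0, 10.0], [3.0, 30.0]])
toy_test = np.array([[2.0, 20.0]])
toy_train, toy_mean, toy_std = _normalize(toy_train, train=True)
toy_test, _, _ = _normalize(toy_test, train=False, X_mean=toy_mean, X_std=toy_std)
print(toy_test)  # [[0. 0.]] -- the midpoint of each training column maps to 0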
data_dim = X_train.shape[1]
print('Size of training set: {}'.format(train_size))
print('Size of development set: {}'.format(dev_size))
print('Size of testing set: {}'.format(test_size))
print('Dimension of data: {}'.format(data_dim))

"""------------------- Gradient and loss functions -----------------------"""
def _cross_entropy_loss(y_pred, Y_label):
    """
    Cross-entropy loss, following the derived formula.
    :param y_pred: predicted probabilities, float vector
    :param Y_label: ground-truth labels, boolean vector
    :return: a scalar
    """
    cross_entropy = -np.dot(Y_label, np.log(y_pred)) - np.dot((1 - Y_label), np.log(1 - y_pred))
    return cross_entropy

def _gradient(X, Y_label, w, b):
    """
    Gradient of the cross-entropy loss with respect to w and b,
    following the formula derived in the notes: w_grad = -X^T (Y - y_pred).
    """
    y_pred = _f(X, w, b)
    pred_error = Y_label - y_pred
    w_grad = -np.sum(pred_error * X.T, 1)
    b_grad = -np.sum(pred_error)
    return w_grad, b_grad
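# Illustrative finite-difference check of _gradient on toy data -- a sketch
# only; the shapes and eps below are arbitrary choices, not part of the
# original code. A separate Generator is used so the global RNG state that
# drives the training shuffles is left untouched.
rng = np.random.default_rng(1)
X_check = rng.standard_normal((5, 3))
Y_check = np.array([0.0, 1.0, 1.0, 0.0, 1.0])
w_check = np.zeros(3)
b_check = np.zeros(1)
w_grad_check, _ = _gradient(X_check, Y_check, w_check, b_check)
eps = 1e-5
for j in range(3):
    w_plus, w_minus = w_check.copy(), w_check.copy()
    w_plus[j] += eps
    w_minus[j] -= eps
    numeric = (_cross_entropy_loss(_f(X_check, w_plus, b_check), Y_check)
               - _cross_entropy_loss(_f(X_check, w_minus, b_check), Y_check)) / (2 * eps)
    print(j, w_grad_check[j], numeric)  # analytic and numeric values should agree closely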
"""------------------------- Training --------------------------"""
"""
We train with mini-batch gradient descent (MBGD): the training set is split
into many small batches; for each batch we compute the gradient and the loss,
and update the model's parameters based on that batch. When one epoch is done,
i.e. every batch of the training set has been used once, we shuffle all the
training data and split it into new batches for the next epoch, until the
preset number of epochs is reached.
"""
# Initialize the parameters w and b
w = np.zeros((data_dim,))
b = np.zeros((1,))

# Training hyperparameters: number of epochs, batch size and learning rate
max_iter = 40
batch_size = 10
lr = 0.1

# Keep the loss and accuracy of every epoch for plotting
train_loss = []
dev_loss = []
train_acc = []
dev_acc = []

# Count the number of parameter updates
count = 1

# Iterative training
for epoch in range(max_iter):
    # Random shuffle at the beginning of each epoch
    X_train, Y_train = _shuffle(X_train, Y_train)

    # Mini-batch training
    for idx in range(int(np.floor(train_size / batch_size))):
        X = X_train[idx * batch_size:(idx + 1) * batch_size]
        Y = Y_train[idx * batch_size:(idx + 1) * batch_size]

        # Compute the gradient
        w_grad, b_grad = _gradient(X, Y, w, b)

        # Gradient descent update; the learning rate decays over time
        w = w - lr / np.sqrt(count) * w_grad
        b = b - lr / np.sqrt(count) * b_grad

        count = count + 1

    # Compute the loss and accuracy on the training set and the development set
    y_train_pred = _f(X_train, w, b)
    Y_train_pred = np.round(y_train_pred)  # round to hard labels
    train_acc.append(_accuracy(Y_train_pred, Y_train))
    train_loss.append(_cross_entropy_loss(y_train_pred, Y_train) / train_size)

    y_dev_pred = _f(X_dev, w, b)
    Y_dev_pred = np.round(y_dev_pred)
    dev_acc.append(_accuracy(Y_dev_pred, Y_dev))
    dev_loss.append(_cross_entropy_loss(y_dev_pred, Y_dev) / dev_size)

print('Training loss: {}'.format(train_loss[-1]))
print('Development loss: {}'.format(dev_loss[-1]))
print('Training accuracy: {}'.format(train_acc[-1]))
print('Development accuracy: {}'.format(dev_acc[-1]))

"""------------------------ Plotting ----------------------------"""
# Loss curve
plt.plot(train_loss)
plt.plot(dev_loss)
plt.title('Loss')
plt.legend(['train', 'dev'])
plt.savefig('loss.png')
plt.show()

# Accuracy curve
plt.plot(train_acc)
plt.plot(dev_acc)
plt.title('Accuracy')
plt.legend(['train', 'dev'])
plt.savefig('acc.png')
plt.show()

"""------------------------------ Saving -------------------------------"""
# Predict testing labels and save them
predictions = _predict(X_test, w, b)
with open(output_fpath.format('logistic'), 'w') as f:
    f.write('id,label\n')
    for i, label in enumerate(predictions):
        f.write('{},{}\n'.format(i, label))

# Print out the most significant weights
ind = np.argsort(np.abs(w))[::-1]  # indexes sorted by |w|, descending
with open(X_test_fpath) as f:
    content = f.readline().strip('\n').split(',')
# Skip the id field and remove the dropped columns so the feature names
# stay aligned with the entries of w
features = np.delete(np.array(content[1:]), drop_list)
for i in ind[0:10]:
    print('*************')
    print(features[i], w[i])
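# Optional read-back of the saved submission -- a format sanity check only,
# not part of the original assignment code.
submission = pd.read_csv(output_fpath.format('logistic'))
print(submission.shape)  # (number of test rows, 2), with columns id and label
print(submission.head())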