1.1 输入信息
输入分为三部分:
- train_data.txt 为已经做好特征工程处理的本地训练集文件。每一行为一条数据记录,各字段以逗号分隔;最后一列为类别(二分类),前面的列为特征值。
- test_data.txt 为需要预测的本地测试集文件。特征数和训练集一致,不含类别信息。
- 示例代码为准确率和性能待优化的参考代码,支持的语言分别为 C++/Python/Java。
answer.txt为test_data.txt的二分类结果,用于练习的时候使用。
1.2 输出信息
输出信息为一个文件result.txt,按行顺序放置测试集记录的预测结果,每一行代表一条测试数据的二分类结果。
1.3 限制条件
- 选手拿到的训练集和测试集并不是最终判题用的数据。
- 示例代码的算法实现为LR(逻辑回归),选手可以将其改为其它的机器学习算法,但程序中定义的输入输出文件路径不能改。
- 不允许使用外部机器学习库。
示例代码(LR,逻辑回归)
import math
import datetime
import sys
import numpy as np


class LR:
    """Binary logistic-regression classifier trained with batch gradient steps.

    The model has one weight per feature column and no bias term. Training
    data is read from ``train_file_name`` (comma-separated, label in the last
    column), test data from ``test_file_name`` (features only), and the
    predicted labels are written to ``predict_result_file_name``, one per line.
    """

    def __init__(self, train_file_name, test_file_name, predict_result_file_name):
        self.train_file = train_file_name
        self.predict_file = test_file_name
        self.predict_result_file = predict_result_file_name
        self.max_iters = 760  # number of full-batch update steps
        self.rate = 0.1       # learning rate applied to each update
        self.feats = []
        self.labels = []
        self.feats_test = []
        self.labels_predict = []
        self.param_num = 0
        self.weight = []

    def loadDataSet(self, file_name, label_existed_flag):
        """Parse a comma-separated numeric file into NumPy arrays.

        Args:
            file_name: path of the text file to read.
            label_existed_flag: ``1`` when the last column of every row is the
                label; any other value treats every column as a feature.

        Returns:
            ``(feats, labels)``; ``labels`` is an empty array when the file
            carries no label column.
        """
        feats = []
        labels = []
        with open(file_name) as fr:
            for line in fr:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                values = [float(item) for item in line.split(',')]
                if label_existed_flag == 1:
                    feats.append(values[:-1])
                    labels.append(values[-1])
                else:
                    feats.append(values)
        return np.array(feats), np.array(labels)

    def loadTrainData(self):
        """Load the labelled training set into ``self.feats`` / ``self.labels``."""
        self.feats, self.labels = self.loadDataSet(self.train_file, 1)

    def loadTestData(self):
        """Load the unlabelled test set into ``self.feats_test``."""
        self.feats_test, self.labels_predict = self.loadDataSet(
            self.predict_file, 0)

    def savePredictResult(self):
        """Print the predictions and write them to the result file, one per line."""
        print(self.labels_predict)
        with open(self.predict_result_file, 'w') as f:
            # One prediction per line: the evaluation code below reads the
            # result file line by line.
            f.writelines(str(label) + "\n" for label in self.labels_predict)

    def sigmod(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``."""
        return 1/(1+np.exp(-x))

    def printInfo(self):
        """Print the configured file names and the currently loaded data."""
        print(self.train_file)
        print(self.predict_file)
        print(self.predict_result_file)
        print(self.feats)
        print(self.labels)
        print(self.feats_test)
        print(self.labels_predict)

    def initParams(self):
        """Initialise every weight to 1.0."""
        # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` selects
        # the same float64 dtype on every NumPy version.
        self.weight = np.ones((self.param_num,), dtype=float)

    def compute(self, recNum, param_num, feats, w):
        """Return ``sigmoid(feats . w)`` for every row of ``feats``.

        ``recNum`` and ``param_num`` are unused; they are kept so existing
        callers keep working.
        """
        return self.sigmod(np.dot(feats, w))

    def error_rate(self, recNum, label, preval):
        """Return the sum of squared differences between labels and predictions.

        ``recNum`` is unused; it is kept so existing callers keep working.
        """
        return np.power(label - preval, 2).sum()

    def predict(self):
        """Load the test set, classify every record and save the result file."""
        self.loadTestData()
        preval = self.compute(len(self.feats_test),
                              self.param_num, self.feats_test, self.weight)
        # Adding 0.5 and truncating maps probabilities >= 0.5 to class 1.
        # ``np.int`` was removed in NumPy 1.24; use the builtin ``int``.
        self.labels_predict = (preval+0.5).astype(int)
        self.savePredictResult()

    def train(self):
        """Load the training set and fit the weights with ``max_iters`` updates."""
        self.loadTrainData()
        recNum = len(self.feats)
        self.param_num = len(self.feats[0])
        self.initParams()
        # ``%f`` (microseconds) replaces the stray literal "f" of the old format.
        ISOTIMEFORMAT = '%Y-%m-%d %H:%M:%S,%f'
        for i in range(self.max_iters):
            preval = self.compute(recNum, self.param_num,
                                  self.feats, self.weight)
            sum_err = self.error_rate(recNum, self.labels, preval)
            if i % 30 == 0:
                # Progress report every 30 iterations.
                print("Iters:" + str(i) + " error:" + str(sum_err))
                theTime = datetime.datetime.now().strftime(ISOTIMEFORMAT)
                print(theTime)
            err = self.labels - preval
            # Mean over all records of feature * (label - prediction).
            delt_w = np.dot(self.feats.T, err)
            delt_w /= recNum
            self.weight += self.rate*delt_w


def print_help_and_exit():
    """Print the command-line usage string and exit with status -1."""
    print("usage:python3 main.py train_data.txt test_data.txt predict.txt [debug]")
    sys.exit(-1)


def parse_args():
    """Return True when the single command-line argument is ``debug``.

    With exactly one argument that is not ``debug`` the usage text is printed
    and the process exits; with any other argument count False is returned.
    """
    debug = False
    if len(sys.argv) == 2:
        if sys.argv[1] == 'debug':
            print("test mode")
            debug = True
        else:
            print_help_and_exit()
    return debug


def _read_int_labels(file_name):
    """Read one integer label per non-empty line of ``file_name``."""
    with open(file_name, 'r') as f:
        return [int(float(line.strip())) for line in f if line.strip()]


if __name__ == "__main__":
    train_file = "./data/train_data.txt"
    test_file = "./data/test_data.txt"
    predict_file = "./data/result.txt"
    lr = LR(train_file, test_file, predict_file)
    lr.train()
    lr.predict()
    # NOTE: debug is hard-coded on (parse_args() is not consulted), so the
    # accuracy check below always runs and requires ./data/answer.txt.
    debug = True

    if debug:
        answer_file = "./data/answer.txt"
        a = _read_int_labels(answer_file)
        p = _read_int_labels(predict_file)

        print("answer lines:%d" % (len(a)))
        print("predict lines:%d" % (len(p)))

        errline = sum(1 for expected, got in zip(a, p) if expected != got)
        # Answers without a matching prediction count as errors instead of
        # raising IndexError.
        errline += max(len(a) - len(p), 0)

        if a:
            # Skipped for an empty answer file to avoid dividing by zero.
            accuracy = (len(a)-errline)/len(a)
            print("accuracy:%f" % (accuracy))