- XGBoost自动读取数据,判断蘑菇是否有毒 二分类
# /usr/bin/python
# -*- encoding:utf-8 -*-
# Binary classification: decide whether a mushroom is poisonous.
#
# Topics covered by this script:
#   1. Basic XGBoost usage.
#   2. A custom objective (gradient and second derivative of the loss).
#   3. The binary:logistic / binary:logitraw objectives.
import xgboost as xgb
import numpy as np


def log_reg(y_hat, y):
    """Custom logistic-loss objective for ``xgb.train(obj=...)``.

    Args:
        y_hat: Raw (pre-sigmoid) scores for each sample.
        y: Object exposing ``get_label()`` (a ``DMatrix`` in this script)
            that returns the 0/1 labels.

    Returns:
        ``(g, h)``: per-sample gradient ``p - label`` and second derivative
        ``p * (1 - p)``, where ``p = sigmoid(y_hat)``.
    """
    p = 1.0 / (1.0 + np.exp(-y_hat))
    g = p - y.get_label()
    h = p * (1.0 - p)
    return g, h


def error_rate(y_hat, y, threshold=0.0):
    """Custom evaluation metric: fraction of misclassified samples.

    Args:
        y_hat: Predicted scores for each sample.
        y: Object exposing ``get_label()`` that returns the 0/1 labels.
        threshold: Scores strictly above this value count as class 1.
            The default is 0 because this script trains with
            ``binary:logitraw`` (or the custom ``log_reg`` objective), whose
            outputs are raw margins, not probabilities; this matches the
            ``y_hat > 0`` rule used for the final evaluation below. Pass 0.5
            when the scores are probabilities.

    Returns:
        The pair ``('error', rate)`` expected from an XGBoost custom metric.
    """
    mismatches = y.get_label() != (np.asarray(y_hat) > threshold)
    return 'error', float(np.sum(mismatches)) / len(y_hat)


def main():
    """Train a 3-round booster on the agaricus data and report test error."""
    # Load the data (libsvm-format text files read directly by XGBoost).
    data_train = xgb.DMatrix('12.agaricus_train.txt')
    data_test = xgb.DMatrix('12.agaricus_test.txt')

    # Parameters:
    #   max_depth: 2  -> every tree is at most 2 levels deep
    #   eta: 1        -> shrinkage (learning rate)
    #   silent: 1     -> suppress training messages
    #                    NOTE(review): newer XGBoost releases use `verbosity`
    #                    instead; confirm against the installed version.
    #   objective: binary:logitraw -> binary classification, raw margin output
    param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logitraw'}

    # Data sets whose metrics are reported after every boosting round.
    watchlist = [(data_test, 'eval'), (data_train, 'train')]

    # Three boosting rounds -> three trees.
    n_round = 3
    bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist)
    # Variant using the custom objective and metric defined above:
    # bst = xgb.train(param, data_train, num_boost_round=n_round,
    #                 evals=watchlist, obj=log_reg, feval=error_rate)

    # Compute the error rate on the test set. Predictions are raw margins
    # under binary:logitraw, so the decision boundary is 0.
    y_hat = bst.predict(data_test)
    y = data_test.get_label()
    error = int(np.sum(y != (y_hat > 0)))
    # Named differently from the error_rate() function so it is not shadowed.
    test_error_rate = float(error) / len(y_hat)
    print('样本总数: ', len(y_hat))
    print('错误数目: %4d' % error)
    print('错误率: %.5f%%' % (100 * test_error_rate))


if __name__ == "__main__":
    main()
- 判断蘑菇是否有毒 手动读取数据
# /usr/bin/python
# -*- coding:utf-8 -*-
# Poisonous-mushroom classification, reading the libsvm-format file by hand.
import xgboost as xgb
import numpy as np
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


def read_data(path, n_features=None):
    """Parse a libsvm-style text file into a dense feature matrix and labels.

    Each non-blank line has the form ``label key:value key:value ...`` where
    ``key`` is used directly as the column index.

    Args:
        path: Path of the text file to read.
        n_features: Optional number of columns of the returned matrix. When
            omitted it is inferred as ``max(key) + 1`` (0 if the file holds no
            features). Supplying it keeps matrices read from different files
            the same width.

    Returns:
        ``(x, y)``: ``x`` is a dense ``(n_samples, n_features)`` array and
        ``y`` the integer labels, one per parsed line.

    Raises:
        ValueError: If a token is malformed, or ``n_features`` is too small
            for the column indices found in the file.
    """
    labels = []
    # rows / cols / values are parallel lists: values[i] sits at (rows[i], cols[i]).
    rows = []
    cols = []
    values = []
    n_rows = 0
    with open(path) as data_file:
        for line in data_file:
            fields = line.split()
            if not fields:
                # Skip blank lines without consuming a row index.
                continue
            labels.append(int(fields[0]))
            for item in fields[1:]:
                key, value = item.split(':')
                rows.append(n_rows)
                cols.append(int(key))
                values.append(float(value))
            n_rows += 1

    if n_features is None:
        n_features = max(cols) + 1 if cols else 0
    # The explicit shape keeps one row per label even when trailing samples
    # have no features (shape inference would silently drop those rows).
    x = scipy.sparse.csr_matrix(
        (values, (rows, cols)), shape=(n_rows, n_features)
    ).toarray()
    y = np.array(labels)
    return x, y


def show_accuracy(a, b, tip):
    """Print the fraction of positions where ``a`` and ``b`` agree.

    Args:
        a: Predicted labels (numpy array).
        b: True labels (numpy array with the same number of elements).
        tip: Text printed in front of the accuracy value.
    """
    acc = a.ravel() == b.ravel()
    print(tip + '正确率: ', float(acc.sum()) / a.size)


def main():
    """Compare logistic regression and XGBoost on the agaricus data."""
    # Each row of x is one sample's features; y holds the labels.
    x, y = read_data('12.agaricus_train.txt')
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=1, train_size=0.6)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    show_accuracy(y_hat, y_test, 'Logistic回归 ')

    # XGBoost
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    # Data sets whose metrics are reported after every boosting round.
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    # multi:softmax makes predict() return class ids directly; the class
    # count is taken from the labels instead of being hard-coded.
    # NOTE(review): this assumes labels are consecutive integers from 0.
    param = {'max_depth': 3, 'eta': 1, 'silent': 0,
             'objective': 'multi:softmax', 'num_class': int(len(np.unique(y)))}
    bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list)
    y_hat = bst.predict(data_test)
    show_accuracy(y_hat, y_test, 'XGBoost ')


if __name__ == '__main__':
    main()
- 鸢尾花数据判断 多分类
# /usr/bin/python
# -*- encoding:utf-8 -*-
# Iris species prediction: multi-class classification with XGBoost.
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split  # formerly cross_validation


def iris_type(s):
    """Map an iris species name to its integer class id.

    Args:
        s: Species name as ``str`` or ``bytes``. ``np.loadtxt`` hands
            converters one or the other depending on the NumPy version and
            its ``encoding`` argument, so both are accepted.

    Returns:
        0 for Iris-setosa, 1 for Iris-versicolor, 2 for Iris-virginica.

    Raises:
        KeyError: If the name is not one of the three species.
    """
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    return it[s]


def main():
    """Train a softmax booster on the iris data and print test accuracy."""
    # Path of the data file, relative to the working directory (portable
    # replacement for the Windows-only '.\8.iris.data').
    path = '8.iris.data'
    # Column 4 holds the species name; convert it to a class id while loading.
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    # x: the first four columns (features); y: the remaining column (label).
    x, y = np.split(data, (4,), axis=1)
    # Hold out 50 samples for testing.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=1, test_size=50)

    # Wrap features and labels for XGBoost.
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    # Data sets whose metrics are reported after every boosting round.
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    # Trees are at most 3 levels deep; multi:softmax -> multi-class output.
    param = {'max_depth': 3, 'eta': 0.3, 'silent': 1,
             'objective': 'multi:softmax', 'num_class': 3}
    # Six boosting rounds.
    bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)

    y_hat = bst.predict(data_test)
    # y_test is a column vector; flatten it to compare element-wise.
    result = y_test.ravel() == y_hat
    print('正确率: ', float(np.sum(result)) / len(y_hat))
    print('END..... ')


if __name__ == "__main__":
    main()
- 葡萄酒的分类问题
# /usr/bin/python
# -*- encoding:utf-8 -*-
# Wine classification problem.
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split  # formerly cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


def show_accuracy(a, b, tip):
    """Print the share of positions at which ``a`` and ``b`` are equal.

    Args:
        a: Predicted labels (numpy array).
        b: Reference labels (numpy array with the same number of elements).
        tip: Text printed in front of the accuracy value.
    """
    hits = a.ravel() == b.ravel()
    ratio = float(hits.sum()) / a.size
    print(tip + '正确率: ', ratio)


if __name__ == "__main__":
    # Load the data set: the first column is the label, the rest are features.
    wine = np.loadtxt('12.wine.data', dtype=float, delimiter=',')
    y, x = np.split(wine, (1,), axis=1)

    # Split into training and test halves.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=1, test_size=0.5)

    # Logistic regression baseline.
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    show_accuracy(lr.predict(x_test), y_test, 'Logistic回归 ')

    # XGBoost class ids start at 0, so label 3 is rewritten to 0 in place.
    for labels in (y_train, y_test):
        labels[labels == 3] = 0

    # Wrap features and labels for XGBoost.
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)

    # Data sets whose metrics are reported after every boosting round.
    watch_list = [(data_test, 'eval'), (data_train, 'train')]

    # Training parameters.
    param = dict(max_depth=3, eta=1, silent=0,
                 objective='multi:softmax', num_class=3)

    # Train, predict, and report the accuracy.
    bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list)
    y_hat = bst.predict(data_test)
    show_accuracy(y_hat, y_test, ' XGBoost ')
- 泰坦尼克号问题
# /usr/bin/python
# -*- encoding:utf-8 -*-
# Titanic survival prediction: logistic regression vs. random forest vs. XGBoost.
import xgboost as xgb
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import csv

# One-hot columns produced for the port of embarkation.
EMBARKED_COLUMNS = ['Embarked_C', 'Embarked_Q', 'Embarked_S']
# Cleaned columns used as model features, in this order.
FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'] + EMBARKED_COLUMNS
# Number of times the training rows are repeated by load_data().
TRAIN_REPEATS = 5


def show_accuracy(a, b, tip):
    """Return the percentage of positions where ``a`` and ``b`` are equal.

    Args:
        a: Predicted labels (numpy array).
        b: True labels (numpy array with the same number of elements).
        tip: Model name; kept for interface compatibility, currently unused.

    Returns:
        Accuracy as a percentage in ``[0, 100]``.
    """
    acc = a.ravel() == b.ravel()
    acc_rate = 100 * float(acc.sum()) / a.size
    return acc_rate


def _fill_missing_fare(data):
    """Replace missing fares with the median fare of the same Pclass (in place)."""
    if not data.Fare.isnull().any():
        return
    for pclass in (1, 2, 3):
        median_fare = data.loc[data.Pclass == pclass, 'Fare'].dropna().median()
        data.loc[data.Fare.isnull() & (data.Pclass == pclass), 'Fare'] = median_fare


def _fill_missing_age(data, predictor_columns):
    """Predict missing ages with a random forest fit on the known ages (in place)."""
    missing = data.Age.isnull()
    if not missing.any():
        # Nothing to impute; predicting on an empty matrix would raise.
        return
    known = data.loc[~missing]
    rfr = RandomForestRegressor(n_estimators=1000)
    rfr.fit(known[predictor_columns].values, known['Age'].values)
    data.loc[missing, 'Age'] = rfr.predict(data.loc[missing, predictor_columns].values)


def load_data(file_name, is_train):
    """Read a Titanic CSV file (with header row) and build the feature matrix.

    Cleaning steps: encode ``Sex`` as 0/1, fill missing ``Fare`` with the
    per-class median, predict missing ``Age`` with a random forest, fill
    missing ``Embarked`` with ``'S'`` and one-hot encode it.

    Args:
        file_name: Path of the CSV file.
        is_train: True for the training file, which has a ``Survived``
            column; False for the test file, which does not.

    Returns:
        For training data ``(x, y)``: both arrays repeated ``TRAIN_REPEATS``
        times along the rows, with ``y`` shaped ``(n, 1)``.
        For test data ``(x, passenger_id)``: one feature row per passenger and
        the matching ``PassengerId`` column.
    """
    data = pd.read_csv(file_name)

    # Sex: female -> 0, male -> 1.
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

    _fill_missing_fare(data)

    # Age: missing values are predicted by a random forest. Only the training
    # file has 'Survived', so it is used as a predictor only there.
    if is_train:
        print(' 随机森林预测缺失年龄:--start-- ')
        _fill_missing_age(data, ['Survived', 'Fare', 'Parch', 'SibSp', 'Pclass'])
        print(' 随机森林预测缺失年龄:--over-- ')
    else:
        print(' 随机森林预测缺失年龄2:--start-- ')
        _fill_missing_age(data, ['Fare', 'Parch', 'SibSp', 'Pclass'])
        print(' 随机森林预测缺失年龄2:- -over-- ')

    # Port of embarkation: missing values become 'S' (per the original
    # author's note, the port with the most passengers), then one-hot encode.
    data.loc[data.Embarked.isnull(), 'Embarked'] = 'S'
    embarked_data = pd.get_dummies(data.Embarked, prefix='Embarked')
    # Guarantee all three columns exist even if a port is absent from the file.
    embarked_data = embarked_data.reindex(columns=EMBARKED_COLUMNS, fill_value=0)
    data = pd.concat([data, embarked_data], axis=1)

    # Force a float matrix: mixed bool/numeric columns would otherwise give
    # an object-dtype array.
    x = np.array(data[FEATURE_COLUMNS], dtype=float)

    if not is_train:
        # Test rows are not repeated so they stay aligned with passenger ids.
        return x, data['PassengerId']

    y = np.array(data['Survived']).reshape(-1, 1)
    # Repeat the rows to enlarge the data set.
    # NOTE(review): identical rows will land in both halves of a later
    # train/test split, which inflates the reported accuracy; confirm this
    # is intended.
    x = np.tile(x, (TRAIN_REPEATS, 1))
    y = np.tile(y, (TRAIN_REPEATS, 1))
    return x, y


def write_result(c, c_type):
    """Predict survival for the test file and write ``Prediction_<c_type>.csv``.

    Args:
        c: Fitted model exposing ``predict``.
        c_type: Integer model id used in the output file name. The value 3
            marks an XGBoost booster, whose input is wrapped in a ``DMatrix``.
    """
    file_name = '12.Titanic.test.csv'
    x, passenger_id = load_data(file_name, False)
    if c_type == 3:
        x = xgb.DMatrix(x)
    # Scores above 0.5 count as survived (1), everything else as 0.
    y = (c.predict(x) > 0.5).astype(int)
    # Text mode with newline='' is what csv.writer expects on Python 3.
    with open('Prediction_%d.csv' % c_type, 'w', newline='') as predictions_file:
        writer = csv.writer(predictions_file)
        writer.writerow(['PassengerId', 'Survived'])
        writer.writerows(zip(passenger_id, y))


def main():
    """Train the three models on the training file and print their accuracy."""
    x, y = load_data('12.Titanic.train.csv', True)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.5, random_state=1)

    # Logistic regression (scikit-learn expects 1-D labels, hence ravel()).
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    lr_rate = show_accuracy(y_hat, y_test, ' Logistic回归')

    # Random forest with 100 trees.
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(x_train, y_train.ravel())
    y_hat = rfc.predict(x_test)
    rfc_rate = show_accuracy(y_hat, y_test, ' 随机森林')

    # XGBoost
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    # Data sets whose metrics are reported after every boosting round.
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    # Binary classification; predictions are probabilities.
    param = {'max_depth': 3, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}
    bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list)
    # Probabilities above 0.5 become class 1, the rest class 0.
    y_hat = (bst.predict(data_test) > 0.5).astype(int)
    xgb_rate = show_accuracy(y_hat, y_test, ' XGBoost ')

    print(' Logistic回归:%.3f%% ' % lr_rate)
    print(' 随机森林:%.3f%% ' % rfc_rate)
    print(' XGBoost:%.3f%% ' % xgb_rate)


if __name__ == "__main__":
    main()
- 泰坦尼克号问题