Process Template
-
Define the Problem
- Import libraries
- Import the dataset
-
Import with the standard Python library
```python
from csv import reader
from urllib.request import urlopen
import numpy as np

filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data'
# open() cannot read a URL, so fetch the file with urllib instead
with urlopen(filename) as raw_data:
    readers = reader(raw_data.read().decode('utf-8').splitlines(), delimiter=',')
    x = list(readers)
# flag.data mixes strings (e.g. the country name) with numbers, so the array is
# kept as strings; .astype('float') works only once the data is purely numeric
data = np.array(x)
print(data.shape)
```
-
Import with NumPy
```python
from urllib.request import urlopen
from numpy import loadtxt

filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data'
# loadtxt() expects purely numeric data; flag.data also contains string columns,
# so only the numeric columns (1-16) are loaded here
with urlopen(filename) as raw_data:
    data = loadtxt(raw_data, delimiter=',', usecols=range(1, 17))
print(data.shape)
```
-
Import with Pandas
```python
from pandas import read_csv

# read_csv() can fetch the URL directly
filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data'
names = ['name', 'landmass', 'zone', 'area', 'population', 'language', 'religion',
         'bars', 'stripes', 'colours', 'red', 'green', 'blue', 'gold', 'white',
         'black', 'orange', 'mainhue', 'circles', 'crosses', 'saltires', 'quarters',
         'sunstars', 'crescent', 'triangle', 'icon', 'animate', 'text', 'topleft',
         'botright']
data = read_csv(filename, names=names)
print(data.shape)
```
-
-
Understand the Data
-
Descriptive statistics: analyze the data
```python
from pandas import set_option

# Peek at the first 10 rows
print(data.head(10))
# Dimensions of the data
print(data.shape)
# Attribute types
print(data.dtypes)
# Descriptive statistics
set_option('display.width', 100)    # set the display width
set_option('display.precision', 4)  # set the display precision
print(data.describe())
# Class distribution ('class' stands for the label column, e.g. 'religion' in the flag data)
print(data.groupby('class').size())
# Pairwise correlations
set_option('display.precision', 2)
print(data.corr(method='pearson'))
# Skew of each attribute (deviation from a Gaussian distribution)
print(data.skew())
```
-
Data visualization: look at the data
```python
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Histograms
data.hist()
# Density plots
data.plot(kind='density', subplots=True, layout=(3, 3), sharex=False)
# Box-and-whisker plots
data.plot(kind='box', subplots=True, layout=(3, 3), sharex=False)
# Correlation matrix plot
correlations = data.corr()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# one tick per attribute (9 here, as in the book's example);
# ticks and names must have the same length
ticks = np.arange(0, 9, 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
# Scatter-plot matrix
scatter_matrix(data)
plt.show()
```
-
-
Prepare the Data
-
Data cleaning
Clean the data by removing duplicate records, marking erroneous values, and even flagging bad input records (a minimal sketch follows below).
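The book gives no code for this step, so here is a minimal pandas sketch of those three operations; the frame, column names, and validity rule are invented for illustration:
```python
import pandas as pd
import numpy as np

# Hypothetical frame with one duplicate row and one impossible value
df = pd.DataFrame({'area': [648, 29, 29, -5],
                   'population': [16, 3, 3, 20]})

# Remove exact duplicate rows
df = df.drop_duplicates()

# Mark erroneous values (a negative area is impossible) as missing
df.loc[df['area'] < 0, 'area'] = np.nan

# Either drop the marked rows or impute them, e.g. with the column median
df['area'] = df['area'].fillna(df['area'].median())
print(df)
```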
-
Feature selection
Remove redundant feature attributes and add new ones.
```python
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

# Split the data into inputs and output
# (assumes features in columns 0-7 and the label in column 8)
array = data.values
x = array[:, 0:8]
y = array[:, 8]

# Univariate feature selection with the chi-squared test: the statistic measures
# how far the observed values deviate from the expected ones; the smaller the
# chi-squared value, the smaller the deviation and the better the fit
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(x, y)
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(x)
print(features)

# Feature selection by recursive feature elimination (RFE)
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(x, y)
print('Number of features:', fit.n_features_)
print('Selected features:', fit.support_)
print('Feature ranking:', fit.ranking_)

# Feature selection by principal component analysis (PCA)
pca = PCA(n_components=3)
fit = pca.fit(x)
print('Explained variance: %s' % fit.explained_variance_ratio_)
print(fit.components_)

# Feature importance, estimated with tree ensembles
model = ExtraTreesClassifier()
fit = model.fit(x, y)
print(fit.feature_importances_)
```
-
Data transforms
Rescale the data or adjust its distribution so that it better exposes the problem to the algorithms.
```python
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer

# Split the data into inputs and output
array = data.values
x = array[:, 0:8]
y = array[:, 8]

# Rescale data: measure all attributes on the same scale; suits gradient descent,
# regression, neural networks and k-nearest neighbors
transformer = MinMaxScaler(feature_range=(0, 1))
newX = transformer.fit_transform(x)

# Standardize data: transform each attribute to mean 0 and standard deviation 1,
# as input for algorithms that assume Gaussian inputs, such as linear regression,
# logistic regression and linear discriminant analysis
transformer = StandardScaler().fit(x)
newX = transformer.transform(x)

# Normalize data: rescale each row to unit length; suits sparse data,
# weight-based neural networks and distance-based k-nearest neighbors
transformer = Normalizer().fit(x)
newX = transformer.transform(x)

# Binarize data: values above the threshold become 1, the rest become 0;
# useful for crisp values or when feature engineering adds new attributes
transformer = Binarizer(threshold=0.0).fit(x)
newX = transformer.transform(x)

# Set the print format and show the result
set_printoptions(precision=3)
print(newX)
```
-
-
Evaluate Algorithms
-
Split the dataset
```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (train_test_split, KFold, LeaveOneOut,
                                     ShuffleSplit, cross_val_score)

# Split into training and evaluation datasets
test_size = 0.33
seed = 4
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(x_train, y_train)
result = model.score(x_test, y_test)
print('Evaluation result: %.3f%%' % (result * 100))

# K-fold cross-validation: split the data into K folds, use each fold once as the
# validation set with the remaining K-1 folds as the training set; this yields K
# models, and the mean validation accuracy of the K models is the classifier's score
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()
result = cross_val_score(model, x, y, cv=kfold)
print('Evaluation result: %.3f%% (%.3f%%)' % (result.mean() * 100, result.std() * 100))

# Leave-one-out cross-validation: each sample serves once as the validation set
# while the other N-1 samples form the training set; the mean accuracy of the N
# models is reported. Compared with K-fold cross-validation:
# 1. nearly all samples are used to train each model
# 2. no randomness affects the procedure, so the experiment is reproducible
loocv = LeaveOneOut()
model = LogisticRegression()
result = cross_val_score(model, x, y, cv=loocv)
print('Evaluation result: %.3f%% (%.3f%%)' % (result.mean() * 100, result.std() * 100))

# Repeated random train/test splits
n_splits = 10
test_size = 0.33
seed = 7
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
model = LogisticRegression()
result = cross_val_score(model, x, y, cv=kfold)
print('Evaluation result: %.3f%% (%.3f%%)' % (result.mean() * 100, result.std() * 100))
```
-
Define model evaluation metrics
-
Classification metrics
```python
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()

# Classification accuracy
result = cross_val_score(model, x, y, cv=kfold)
print('Accuracy: %.3f (%.3f)' % (result.mean(), result.std()))

# Logarithmic loss
scoring = 'neg_log_loss'
result = cross_val_score(model, x, y, cv=kfold, scoring=scoring)
print('Logloss %.3f (%.3f)' % (result.mean(), result.std()))

# Area under the ROC curve (AUC)
scoring = 'roc_auc'
result = cross_val_score(model, x, y, cv=kfold, scoring=scoring)
print('AUC %.3f (%.3f)' % (result.mean(), result.std()))

# Confusion matrix
test_size = 0.33
seed = 4
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(x_train, y_train)
predicted = model.predict(x_test)
matrix = confusion_matrix(y_test, predicted)
classes = ['0', '1']
dataframe = pd.DataFrame(data=matrix, index=classes, columns=classes)
print(dataframe)

# Classification report
# Precision: the proportion of retrieved items that are actually relevant
# Recall: the proportion of relevant items that are actually retrieved
report = classification_report(y_test, predicted)
print(report)
```
-
Regression metrics
```python
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression

n_splits = 10
seed = 7
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
model = LinearRegression()

# Mean absolute error: the average of the absolute deviations between
# predictions and true values
# Mean squared error: the average of the squared deviations (its square
# root is the commonly quoted RMSE)
# Coefficient of determination (R^2): the fraction of the variance in the
# dependent variable that the regression explains
for scoring in ('neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'):
    result = cross_val_score(model, x, y, cv=kfold, scoring=scoring)
    print('%s: %.3f (%.3f)' % (scoring, result.mean(), result.std()))
```
-
-
Spot-Check Algorithms
-
Spot-check classification algorithms
```python
from sklearn.model_selection import KFold, cross_val_score

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Linear algorithms
# Logistic regression: fits a logistic function to predict the probability of an
# event; outputs lie between 0 and 1, which suits binary classification
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

# Linear discriminant analysis: projects high-dimensional samples onto the optimal
# discriminant vector space to extract class information and compress the
# feature-space dimension; after projection the classes have the best separability.
# Like principal component analysis, it is widely used for dimensionality reduction
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
model = LinearDiscriminantAnalysis()

# Nonlinear algorithms
# K-nearest neighbors: if most of a sample's k most similar neighbors in feature
# space belong to one class, the sample is assigned to that class
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

# Naive Bayes: uses an object's prior probability and Bayes' rule to compute its
# posterior probability for every class, then picks the class with the largest posterior
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

# Classification and regression trees (CART): recursively split on each feature,
# partition the input space into a finite set of cells and fix a predictive
# distribution on each cell
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

# Support vector machines: analyze data and recognize patterns for both
# classification and regression
from sklearn.svm import SVC
model = SVC()

# Evaluate whichever model was instantiated last (pick one of the above)
result = cross_val_score(model, x, y, cv=kfold)
print(result.mean())
```
-
Spot-check regression algorithms
```python
from sklearn.model_selection import KFold, cross_val_score

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Linear algorithms
# Linear regression: a statistical method that uses regression analysis to
# determine the quantitative relationship between two or more interdependent variables
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Ridge regression: a biased-estimation method designed for collinear data
# (an improved version of ordinary least squares)
from sklearn.linear_model import Ridge
model = Ridge()

# Lasso regression: similar to ridge regression, but penalizes the absolute
# values of the coefficients rather than their squares
from sklearn.linear_model import Lasso
model = Lasso()

# Elastic net: a blend of lasso and ridge regression, useful when several
# features are correlated
from sklearn.linear_model import ElasticNet
model = ElasticNet()

# Nonlinear algorithms
# K-nearest neighbors: predicts from the nearest samples by distance
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor()

# Classification and regression trees
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()

# Support vector machines
from sklearn.svm import SVR
model = SVR()

# Evaluate whichever model was instantiated last (pick one of the above)
scoring = 'neg_mean_squared_error'
result = cross_val_score(model, x, y, cv=kfold, scoring=scoring)
print('%.3f' % result.mean())
```
-
-
Compare algorithms
```python
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

models = {}
models['LR'] = LogisticRegression()
models['LDA'] = LinearDiscriminantAnalysis()
models['KNN'] = KNeighborsClassifier()
models['CART'] = DecisionTreeClassifier()
models['SVM'] = SVC()
models['NB'] = GaussianNB()

results = []
for name in models:
    result = cross_val_score(models[name], x, y, cv=kfold)
    results.append(result)
    msg = '%s: %.3f (%.3f)' % (name, result.mean(), result.std())
    print(msg)

# Box plot of the results
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(models.keys())
pyplot.show()
```
-
-
Improve the Model
-
Algorithm tuning
-
Grid search for parameter tuning
```python
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Instantiate the algorithm
model = Ridge()
# Parameter grid to search
param_grid = {'alpha': [1, 0.1, 0.01, 0.001, 0]}
# Find the best parameters via grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(x, y)
# Search results
print('Best score: %.3f' % grid.best_score_)
print('Best parameter: %s' % grid.best_estimator_.alpha)
```
-
Random search for parameter tuning
```python
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

model = Ridge()
# Parameter distribution to sample from
param_grid = {'alpha': uniform()}
# Find the best parameters via randomized search
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                          n_iter=100, random_state=7)
grid.fit(x, y)
# Search results
print('Best score: %.3f' % grid.best_score_)
print('Best parameter: %s' % grid.best_estimator_.alpha)
```
-
-
Ensemble methods
```python
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier,
                              ExtraTreesClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, VotingClassifier)

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
num_tree = 100

# Bagging: obtains the final answer by combining the votes of the sub-models
# Bagged decision trees
cart = DecisionTreeClassifier()
# (the parameter below is named base_estimator in scikit-learn < 1.2)
model = BaggingClassifier(estimator=cart, n_estimators=num_tree, random_state=seed)

# Random forest: builds a forest of many decision trees in a random way,
# with no dependence between the individual trees
max_features = 3
model = RandomForestClassifier(n_estimators=num_tree, random_state=seed,
                               max_features=max_features)

# Extra trees: similar to random forest, with two differences:
# 1. random forest uses bagging, while each tree in extra trees is trained
#    on the same full training set
# 2. random forest picks the best split feature within a random subset, while
#    extra trees picks the split feature completely at random
max_features = 7
model = ExtraTreesClassifier(n_estimators=num_tree, random_state=seed,
                             max_features=max_features)

# Boosting: improves the accuracy of weak classifiers, and more generally
# of any given learning algorithm
# AdaBoost: an iterative algorithm that trains different weak classifiers on the
# same training set and combines them into a stronger final classifier
model = AdaBoostClassifier(n_estimators=num_tree, random_state=seed)

# Stochastic gradient boosting: builds the ensemble stage-wise by following
# the gradient of the loss function
model = GradientBoostingClassifier(n_estimators=num_tree, random_state=seed)
result = cross_val_score(model, x, y, cv=kfold)

# Voting: wraps two or more models and averages the sub-models' predictions
models = []
models.append(('logistic', LogisticRegression()))
models.append(('cart', DecisionTreeClassifier()))
models.append(('svm', SVC()))
ensemble_model = VotingClassifier(estimators=models)
result = cross_val_score(ensemble_model, x, y, cv=kfold)
print(result.mean())
```
-
-
Deploy the Result
-
Steps
- Make predictions on the evaluation dataset
- Build the final model on the entire dataset
- Serialize the model
-
Implementation
```python
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Serialize and deserialize the model with pickle
from pickle import dump, load
# Alternatively, use joblib (the standalone package; the old
# sklearn.externals.joblib import was removed from scikit-learn):
# from joblib import dump, load

test_size = 0.33
seed = 4
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(x_train, y_train)

# Save the model, then load it back and evaluate it
model_file = 'finalized_model.sav'
with open(model_file, 'wb') as model_f:
    dump(model, model_f)
with open(model_file, 'rb') as model_f:
    loaded_model = load(model_f)
result = loaded_model.score(x_test, y_test)
print('Evaluation result: %.3f%%' % (result * 100))
```
-
The whole process is not linear but cyclic: expect to spend a lot of time repeating these steps until you arrive at a sufficiently accurate model!
For unsupervised machine-learning algorithms there are no target values, so no supervised training step is needed, as the short sketch below illustrates.
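As a minimal illustration of that point (k-means is not part of the book's template; this sketch only assumes scikit-learn and the x matrix defined earlier):
```python
from sklearn.cluster import KMeans

# Unsupervised: fit on the inputs alone; no target vector y is involved
model = KMeans(n_clusters=3, n_init=10, random_state=7)
model.fit(x)
print(model.labels_[:10])  # cluster assignments of the first ten samples
```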
Note: this post is a summary based on the book 《机器学习 Python实践》.
If you repost it, please credit the source: https://www.cnblogs.com/zhuchenglin/p/10292007.html