^(* ̄(oo) ̄)^:1.有部分代码我进行了数据归一化操作(也叫数据标准化) 在评估的时候使用的inverse_transform函数把数据还原
2.code的代码是按书中的顺序 先进行了 数据抽样 (split) 然后进行了归一化操作(StandardScaler)
先进行数据抽样会是数据的比例发生改变 再进行归一化操作这样是不当的
正常情况应该先进行归一化操作然后再进行数据抽样
PartOne经典分类 模型(做选择题:比如:判断是A类还是B类)
使用线性分类模型从事良/恶性肿瘤预测任务(LogisticRegression和SGDClassifiler)
# Benign/malignant breast-tumour classification with two linear models
# (LogisticRegression and SGDClassifier) on the UCI Wisconsin data file.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
# SGDClassifier is imported from its public location; the private
# `sklearn.linear_model.stochastic_gradient` module path is not part of the
# public API and is unavailable in current scikit-learn releases.
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    names=column_names)

# '?' entries are turned into NaN first so that the per-column missing-value
# counts printed below are meaningful, then incomplete rows are dropped.
data = data.replace(to_replace='?', value=np.nan)
print(data.isnull().sum())
data = data.dropna(how='any')
print(data.shape)

# Columns 1..9 are the features, column 10 ('Class') is the label.
x_train, x_test, y_train, y_test = train_test_split(
    data[column_names[1:10]], data[column_names[10]],
    test_size=0.25, random_state=33)
print(y_train.value_counts())
print(y_test.value_counts())

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so both splits are expressed on the same scale.
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

lr = LogisticRegression()
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)
print('Accuracy of LR ClassifierL:', lr.score(x_test, y_test))
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))

sgdc = SGDClassifier()
sgdc.fit(x_train, y_train)
sgdc_y_predict = sgdc.predict(x_test)
print('Accuracy of SGD ClassifierL:', sgdc.score(x_test, y_test))
print(classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))
对手写数码图像识别(分类)模型(支持向量机)
# Handwritten-digit classification with a linear support vector machine.
from sklearn.datasets import load_digits
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

digits = load_digits()
print(digits.data.shape)

x_train, x_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=33)
print(y_train.shape)
print(y_test.shape)

# The scaler learns mean/variance from the training split only; the test
# split is transformed with those same statistics rather than re-fitted.
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

lsvc = LinearSVC()
lsvc.fit(x_train, y_train)
y_predict = lsvc.predict(x_test)

print('Accuracy of Liner SVC is:', lsvc.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=digits.target_names.astype(str)))
新闻文本分类(朴素贝叶斯)
# 20-newsgroups text classification with a multinomial naive Bayes model
# trained on bag-of-words counts.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

news = fetch_20newsgroups(subset='all')
print(len(news.data))
print(news.data[0])

x_train, x_test, y_train, y_test = train_test_split(
    news.data,
    news.target,
    test_size=0.25,
    random_state=33,
)

# The vocabulary is learned from the training documents and then applied
# unchanged to the test documents.
vec = CountVectorizer()
x_train = vec.fit_transform(x_train)
x_test = vec.transform(x_test)

mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_predict = mnb.predict(x_test)

print('Accuracy of Naive Bayes Classifier is:', mnb.score(x_test, y_test))
print(
    classification_report(
        y_test,
        y_predict,
        target_names=news.target_names,
    )
)
对鸢尾花(lris)数据进行类别预测(K近邻分类)
# Iris species classification with a k-nearest-neighbours classifier.
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

iris = load_iris()
print(iris.data.shape)
print(iris.DESCR)

x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=33)

# Scale with statistics learned from the training split only; re-fitting on
# the test split would place the two splits on different scales, which
# distorts the distances the neighbour search relies on.
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

knc = KNeighborsClassifier()
knc.fit(x_train, y_train)
y_predict = knc.predict(x_test)

print('Accuracy of K-nearest Neighbour Classifier is:', knc.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=iris.target_names))
对泰坦尼克号乘客的生还情况预测(决策树)
# Titanic survival prediction with a single decision tree.
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
# head() only returns a frame; print it so the preview is visible when this
# runs as a script rather than in a notebook cell.
print(titanic.head())
titanic.info()

# Work on an explicit copy so filling 'age' modifies `x` itself instead of
# relying on in-place mutation of a slice of `titanic`.
x = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']
x.info()
x['age'] = x['age'].fillna(x['age'].mean())
x.info()

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33)

# One-hot encode the categorical columns. The feature mapping is learned
# from the training rows and reused for the test rows so both matrices share
# the same column layout.
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
print(vec.feature_names_)
x_test = vec.transform(x_test.to_dict(orient='records'))

dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
y_predict = dtc.predict(x_test)

print(dtc.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=['died', 'survived']))
对泰坦尼克号乘客的生还情况预测(集成模型(分类):随机森林分类器和梯度提升决策树)
# Titanic survival prediction: a single decision tree compared with two
# ensemble classifiers (random forest and gradient boosting).
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

# Explicit copy so the 'age' imputation below is applied to `x` itself.
x = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']
x['age'] = x['age'].fillna(x['age'].mean())

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33)

# Learn the one-hot feature mapping on the training rows only and reuse it
# for the test rows.
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
x_test = vec.transform(x_test.to_dict(orient='records'))

dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
dtc_y_predict = dtc.predict(x_test)

rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)

gbc = GradientBoostingClassifier()
gbc.fit(x_train, y_train)
gbc_y_predict = gbc.predict(x_test)

# classification_report takes (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print('Accuracy of decision tree is:', dtc.score(x_test, y_test))
print(classification_report(y_test, dtc_y_predict))
print('Accuracy of random forest classifier is:', rfc.score(x_test, y_test))
print(classification_report(y_test, rfc_y_predict))
print('Accuracy of gradient tree classifier is:', gbc.score(x_test, y_test))
print(classification_report(y_test, gbc_y_predict))
PartTwo经典回归模型(做计算题:比如:计算某个问题的数值)
使用线性回归器对房屋价格进行预测LinearRegression和Stochastic_Gradient
代码后面的inverse_transform函数的作用是把归一化的数据还原
中间要对标签集进行reshape
# Boston house-price regression with LinearRegression and SGDRegressor.
# NOTE(review): load_boston is only available in older scikit-learn
# releases; confirm the installed version before running.
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
# Imported from the public location instead of the private
# `sklearn.linear_model.stochastic_gradient` module path.
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

boston = load_boston()
print(boston.DESCR)  # dataset description
x = boston.data
y = boston.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33)

# Range and mean of the target values.
print("The max target value is", np.max(boston.target))
print("The min target value is", np.min(boston.target))
print("The average target value is ", np.mean(boston.target))

# The targets vary widely, so features and targets are both standardised.
# Each scaler is fitted on the training split only and then applied to the
# test split. StandardScaler expects 2-D input, hence the reshape of the
# 1-D target vectors into single-column arrays.
ss_X = StandardScaler()
ss_y = StandardScaler()
x_train = ss_X.fit_transform(x_train)
x_test = ss_X.transform(x_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)

# SGDRegressor expects a 1-D target and returns 1-D predictions.
sgdr = SGDRegressor()
sgdr.fit(x_train, y_train.ravel())
sgdr_y_predict = sgdr.predict(x_test)

# inverse_transform maps standardised values back to the original price
# scale, so MSE/MAE are reported in the units of the raw target.
y_test_original = ss_y.inverse_transform(y_test)
lr_predict_original = ss_y.inverse_transform(lr_y_predict)
sgdr_predict_original = ss_y.inverse_transform(sgdr_y_predict.reshape(-1, 1))

print('The value of default measurement of LinearRegression is', lr.score(x_test, y_test))
print('The value of R-squred of LinearRegression is', r2_score(y_test, lr_y_predict))
print('The mean squred error of LinearRegression is', mean_squared_error(y_test_original, lr_predict_original))
print('The mean absoluate error of LinearRegression is', mean_absolute_error(y_test_original, lr_predict_original))

print('The value of default measurement of Regressor is', sgdr.score(x_test, y_test))
print('The value of R-squred of SGDRegressor is', r2_score(y_test, sgdr_y_predict))
print('The mean squred error of SGDRegressor is', mean_squared_error(y_test_original, sgdr_predict_original))
print('The mean absoluate error of SGDRegressor is', mean_absolute_error(y_test_original, sgdr_predict_original))
以三种不同的核函数来使用支持向量机模型对房屋价格进行预测
^(* ̄(oo) ̄)^:我第一遍打代码的时候 没有使用对数据进行归一化操作
其中核函数linear 没有受到影响
但是使用和函数poly训练没有归一化的数据会卡住
核函数rbf的预测准确程度会大幅度下降
# Boston house-price regression with support vector regressors using three
# different kernels (linear, polynomial, RBF).
# NOTE(review): load_boston is only available in older scikit-learn
# releases; confirm the installed version before running.
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

boston = load_boston()
x = boston.data
y = boston.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33)

# Scalers are fitted on the training split and applied to the test split.
ss_x = StandardScaler()
ss_y = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

# SVR expects a 1-D target. All three models now receive the flattened
# vector (previously only the polynomial one did).
linear_svr = SVR(kernel='linear')
linear_svr.fit(x_train, y_train.ravel())
linear_svr_y_predict = linear_svr.predict(x_test)

poly_svr = SVR(kernel='poly')
poly_svr.fit(x_train, y_train.ravel())
poly_svr_y_predict = poly_svr.predict(x_test)

rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(x_train, y_train.ravel())
rbf_svr_y_predict = rbf_svr.predict(x_test)

# Metrics are computed directly on the standardised targets, so the squared
# and absolute errors are in standardised units, not raw prices.
for title, model, predicted in (
        ('I AM Linear_SVR', linear_svr, linear_svr_y_predict),
        ('I AM POLY_SVR', poly_svr, poly_svr_y_predict),
        ('I AM RBF_SVR', rbf_svr, rbf_svr_y_predict),
):
    print(title)
    print('score', model.score(x_test, y_test))
    print('R-squared', r2_score(y_test, predicted))
    print('mean squared', mean_squared_error(y_test, predicted))
    print('mean absolute', mean_absolute_error(y_test, predicted))
使用两种不同配置的k近邻回归模型对房间进行预测(普通算数平均算法和加权平均)
# Boston house-price regression with two k-nearest-neighbour regressors:
# one averaging neighbours uniformly, one weighting them by distance.
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor  # k-nearest-neighbour regressor
from sklearn.preprocessing import StandardScaler

boston = load_boston()
x = boston.data
y = boston.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33)

# Standardise features and targets; scalers are fitted on the training
# split and applied to the test split.
ss_x = StandardScaler()
ss_y = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

# Uniform weighting: prediction is the plain average of the neighbours.
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(x_train, y_train)
uni_knr_y_predict = uni_knr.predict(x_test)

# Distance weighting: closer neighbours contribute more to the prediction.
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(x_train, y_train)
dis_knr_knr_y_predict = dis_knr.predict(x_test)

# Report the same four figures for each model, in the original order.
for title, model, predicted in (
        ('I AM uni_knr', uni_knr, uni_knr_y_predict),
        ('I AM dis_knr', dis_knr, dis_knr_knr_y_predict),
):
    print(title)
    print('score', model.score(x_test, y_test))
    print('R-squared', r2_score(y_test, predicted))
    print('mean squared', mean_squared_error(y_test, predicted))
    print('mean absolute', mean_absolute_error(y_test, predicted))
使用单一回归树对房价进行预测
# Boston house-price regression with a single regression tree.
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor  # regression-tree model

boston = load_boston()
x = boston.data
y = boston.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33)

# Standardise features and targets; scalers are fitted on the training
# split and applied to the test split.
ss_x = StandardScaler()
ss_y = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

dtr = DecisionTreeRegressor()
dtr.fit(x_train, y_train)
dtr_y_predict = dtr.predict(x_test)

# Metrics are evaluated against the standardised test targets.
print('I AM DecisionTreeRegressor')
for label, value in (
        ('score', dtr.score(x_test, y_test)),
        ('R-squared', r2_score(y_test, dtr_y_predict)),
        ('mean squared', mean_squared_error(y_test, dtr_y_predict)),
        ('mean absolute', mean_absolute_error(y_test, dtr_y_predict)),
):
    print(label, value)
使用三种集成回归模型对房价进行预测(RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor)
# Boston house-price regression with three ensemble regressors:
# RandomForestRegressor, ExtraTreesRegressor and GradientBoostingRegressor.
# NOTE(review): load_boston is only available in older scikit-learn
# releases; confirm the installed version before running.
from sklearn.datasets import load_boston
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

boston = load_boston()
x = boston.data
y = boston.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33)

# Scalers are fitted on the training split and applied to the test split.
ss_x = StandardScaler()
ss_y = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

# These estimators expect a 1-D target; y_train is a single-column 2-D array
# after scaling, so it is flattened for fitting.
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train.ravel())
rfr_y_predict = rfr.predict(x_test)

etr = ExtraTreesRegressor()
etr.fit(x_train, y_train.ravel())
etr_y_predict = etr.predict(x_test)

gbr = GradientBoostingRegressor()
gbr.fit(x_train, y_train.ravel())
gbr_y_predict = gbr.predict(x_test)

# Metrics are computed on the standardised targets, so the squared and
# absolute errors are in standardised units, not raw prices.
for title, model, predicted in (
        ('I AM RandomForestRegressor', rfr, rfr_y_predict),
        ('I AM ExtraTreesRegressor', etr, etr_y_predict),
        ('I AM GradientBoostingRegressor', gbr, gbr_y_predict),
):
    print(title)
    print('score', model.score(x_test, y_test))
    print('R-squared', r2_score(y_test, predicted))
    print('mean squared', mean_squared_error(y_test, predicted))
    print('mean absolute', mean_absolute_error(y_test, predicted))