在开始之前先说一个很重要的Tip:电脑至少得是64位的,这是我的痛。
断断续续花了个把月的时间把这本书过了一遍。这是一本非常适合入门的、基于Python的机器学习书籍,全书通俗易懂,并且提供了代码。书中源代码链接提供的是IPython环境下的代码;主页君使用的是PyCharm和Python 2.7,具体安装过程书中写得很详细。码完书中代码,发现有一点点小不符(或许是因为平台不一样),百度基本可以解决问题(有问题也可以留言探讨)。贴一点代码,以示学习:
1_4_7.py:
# coding=utf-8
# Filename: benign / malignant breast-cancer tumour prediction,
# complete code example (linear classifier).
#
# NOTE: every print() call below takes exactly one argument, so the script
# prints the same text under Python 2.7 and Python 3.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# The two feature columns used throughout the example.
FEATURES = ['Clump Thickness', 'Cell Size']


def plot_test_samples(negative, positive, title):
    """Scatter the test samples on the current figure and label it.

    negative -- rows with Type == 0, drawn as red circles
    positive -- rows with Type == 1, drawn as black crosses
    title    -- figure title
    """
    plt.scatter(negative['Clump Thickness'], negative['Cell Size'],
                marker='o', s=200, c='red')
    plt.scatter(positive['Clump Thickness'], positive['Cell Size'],
                marker='x', s=150, c='black')
    plt.xlabel('Clump Thickness')
    plt.ylabel('Cell Size')
    plt.title(title)


def boundary_line(lx, coef, intercept):
    """Return y for each x on the line coef[0]*x + coef[1]*y + intercept = 0."""
    return (-intercept - lx * coef[0]) / coef[1]


# Load the training and test sets with pandas.
df_train = pd.read_csv('breast-cancer-train.csv')
df_test = pd.read_csv('breast-cancer-test.csv')

# Split the test set into negative / positive samples, keeping only the
# two features of interest.
df_test_negative = df_test.loc[df_test['Type'] == 0][FEATURES]
df_test_positive = df_test.loc[df_test['Type'] == 1][FEATURES]

# Figure 1-2: the raw test samples.  plt.show() is deliberately not called
# here (it was disabled in the original), so these points stay on the same
# canvas that figure 1-3 is drawn on.
plot_test_samples(df_test_negative, df_test_positive, '1-2')

# Figure 1-3: a line with randomly sampled coefficients and intercept.
intercept = np.random.random([1])
coef = np.random.random([2])
print('%s %s' % (coef, intercept))
lx = np.arange(0, 12)  # evenly spaced x values
ly = boundary_line(lx, coef, intercept)
plt.plot(lx, ly, c='yellow')
plot_test_samples(df_test_negative, df_test_positive, '1-3')
plt.show()

# Figure 1-4: logistic-regression classifier trained on the first 10 samples.
# A classifier is required here: its score() is the mean accuracy that the
# message below reports (a regressor's score() would be R-squared instead).
lr = LogisticRegression()
lr.fit(df_train[FEATURES][:10], df_train['Type'][:10])
print('Testing accuracy (10 training sample): %s'
      % lr.score(df_test[FEATURES], df_test['Type']))
print("你好,中国")

intercept = lr.intercept_
coef = lr.coef_[0, :]  # coef_ has shape (1, n_features) for a binary problem
print('%s %s' % (coef, intercept))
ly = boundary_line(lx, coef, intercept)
plt.plot(lx, ly, c='green')
plot_test_samples(df_test_negative, df_test_positive, '1-4')
plt.show()

# Figure 1-5: the same classifier trained on ALL training samples (the
# original sliced [:10] again, which just repeated the previous experiment).
lr = LogisticRegression()
lr.fit(df_train[FEATURES], df_train['Type'])
print('Testing accuracy (all training sample): %s'
      % lr.score(df_test[FEATURES], df_test['Type']))

intercept = lr.intercept_
coef = lr.coef_[0, :]
ly = boundary_line(lx, coef, intercept)
plt.plot(lx, ly, c='blue')
plot_test_samples(df_test_negative, df_test_positive, '1-5')
plt.show()
print('end')
2_1_2_1.py:
# coding=utf-8
# __author__ = 'lenovo'
# Predict Boston-area house prices with linear regressors.
#
# NOTE: every print() call below takes exactly one argument, so the script
# prints the same text under Python 2.7 and Python 3.
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

try:
    # Newer scikit-learn releases keep the splitter here.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases (the ones this script was first written against).
    from sklearn.cross_validation import train_test_split

# Load the house-price data set shipped with scikit-learn.
# NOTE(review): load_boston is no longer available in recent scikit-learn
# releases; this script needs a version that still ships it.
boston = load_boston()
print(boston.DESCR)

# Features and regression target.
x = boston.data
y = boston.target

# Hold out 25% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33)

# Inspect the spread of the regression target.
print('The max target value is %s' % np.max(boston.target))
print('The min target value is %s' % np.min(boston.target))
print('The average target value is %s' % np.mean(boston.target))

# The target values differ widely, so standardize features and target.
ss_x = StandardScaler()
ss_y = StandardScaler()

# Fit the scalers on the training data only, then apply them to both splits.
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
# StandardScaler works on 2-D input: present the 1-D target as a single
# column and flatten the result so the regressors still receive 1-D targets.
y_train = ss_y.fit_transform(y_train.reshape(-1, 1)).ravel()
y_test = ss_y.transform(y_test.reshape(-1, 1)).ravel()


def to_original_scale(values):
    """Undo the target standardization for a 1-D array of target values."""
    column = np.asarray(values).reshape(-1, 1)
    return ss_y.inverse_transform(column).ravel()


# Ordinary least-squares linear regression.
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)

# Linear regression fitted by stochastic gradient descent.
sgdr = SGDRegressor()
sgdr.fit(x_train, y_train)
sgdr_y_predict = sgdr.predict(x_test)

# Evaluate LinearRegression: built-in score, R-squared, MSE and MAE
# (the two error metrics are computed on the original target scale).
print('The avlue of default measurement of LinearRegression is %s'
      % lr.score(x_test, y_test))
print('The value of R-Squared of LinearRegression is %s'
      % r2_score(y_test, lr_y_predict))
print('The mean squared error of LinearRegression is %s'
      % mean_squared_error(to_original_scale(y_test),
                           to_original_scale(lr_y_predict)))
print('The mean absoluate error of LinearRegression is %s'
      % mean_absolute_error(to_original_scale(y_test),
                            to_original_scale(lr_y_predict)))

# Evaluate SGDRegressor with the same four measurements.
print('The value of default measurement of SGDRegressor is %s'
      % sgdr.score(x_test, y_test))
print('The value of R-Squared of SGDRegressor is %s'
      % r2_score(y_test, sgdr_y_predict))
print('The mean squared error of SGDRegressor is %s'
      % mean_squared_error(to_original_scale(y_test),
                           to_original_scale(sgdr_y_predict)))
print('The mean absoluate error of SGDRegressor is %s'
      % mean_absolute_error(to_original_scale(y_test),
                            to_original_scale(sgdr_y_predict)))