Basic Concepts
1. Difference from simple linear regression
Multiple independent variables (x) rather than a single one
2. Multiple regression model
y = β0 + β1x1 + β2x2 + ... + βpxp + ε
where β0, β1, β2, ..., βp are the parameters
and ε is the error term
3. Multiple regression equation
E(y) = β0 + β1x1 + β2x2 + ... + βpxp
4. Estimated multiple regression equation:
y_hat = b0 + b1x1 + b2x2 + ... + bpxp
A sample is used to compute b0, b1, b2, ..., bp, the point estimates of the parameters β0, β1, β2, ..., βp
5. Estimation process (similar to simple linear regression)
6. Estimation method
Minimize the sum of squared errors
The computation is analogous to simple linear regression, but involves linear algebra and matrix operations; see the derivation below
Derivation
Stack the m training samples into an m*(1+n) matrix X (each row is a sample with a leading 1 for the intercept) and the parameters into a (1+n)*1 vector θ, so the predictions are y_hat = Xθ. The least-squares objective is

J(θ) = (y - Xθ)^T (y - Xθ) = θ^T (X^T X) θ - 2 (X^T y)^T θ + y^T y

For the first term, X^T X is a symmetric matrix, and the term itself is a scalar: X is an m*(1+n) matrix and θ is a (1+n)*1 matrix, so by the rules of matrix multiplication θ^T (X^T X) θ has shape 1*1. Differentiating it uses the following vector-derivative property:

d(θ^T A θ)/dθ = (A + A^T) θ = 2Aθ   (for symmetric A)

The second term is likewise a scalar by the same shape argument (it combines the two equal cross terms y^T Xθ and θ^T X^T y, since a scalar equals its own transpose), and its derivative follows from the property:

d(a^T θ)/dθ = a

The third term y^T y is likewise a scalar; it does not depend on θ at all, so by the corresponding property its derivative vanishes:

d(c)/dθ = 0   (for c constant in θ)

Setting the gradient to zero gives the normal equation:

dJ/dθ = 2 X^T X θ - 2 X^T y = 0  =>  θ = (X^T X)^{-1} X^T y

This is exactly the θ that fit_normal computes in the code below.
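To make the normal equation concrete before the class-based implementation below, here is a minimal NumPy sketch; the toy data and coefficients are made up for illustration:

import numpy as np

# toy data: 100 samples, 3 features, generated from known coefficients plus noise
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = 4.0 + X @ np.array([1.0, -2.0, 3.0]) + 0.1 * rng.normal(size=100)

# prepend a column of ones so theta[0] plays the role of the intercept b0
X_b = np.hstack([np.ones((len(X), 1)), X])

# normal equation: theta = (X^T X)^{-1} X^T y
theta = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y
print(theta)  # approximately [4, 1, -2, 3]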
Code Implementation
In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
In [12]:
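# note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so these notes assume an older scikit-learn version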
boston = datasets.load_boston()
X = boston.data
y = boston.target
In [13]:
X.shape
Out[13]:
In [14]:
X = X[y < 50.0]
y = y[y < 50.0]
In [15]:
X.shape
Out[15]:
In [56]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
In [57]:
from ml09linearRegression2 import LinearRegression
reg = LinearRegression()
In [58]:
reg.fit_normal(X_train, y_train)
Out[58]:
In [59]:
reg.coef_
Out[59]:
In [60]:
reg.intercept_
Out[60]:
In [61]:
reg.score(X_test, y_test)
Out[61]:
The LinearRegression class used above lives in ml09linearRegression2.py:

import numpy as np
from ml09metrics import r2_score


class LinearRegression:

    def __init__(self):
        """Initialize the Linear Regression model"""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None

    def fit_normal(self, X_train, y_train):
        """Fit the Linear Regression model on X_train, y_train via the normal equation"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, X_predict):
        """Return the prediction vector for the samples in X_predict"""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Compute the accuracy (R^2) of the current model on X_test and y_test"""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
And the metrics it relies on, in ml09metrics.py:

import numpy as np
from math import sqrt


def accuracy_score(y_true, y_predict):
    """Compute the classification accuracy between y_true and y_predict"""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    return np.sum(y_true == y_predict) / len(y_true)


def mean_squared_error(y_true, y_predict):
    """Compute the MSE between y_true and y_predict"""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    return np.sum((y_true - y_predict) ** 2) / len(y_true)


def root_mean_squared_error(y_true, y_predict):
    """Compute the RMSE between y_true and y_predict"""
    return sqrt(mean_squared_error(y_true, y_predict))


def mean_absolute_error(y_true, y_predict):
    """Compute the MAE between y_true and y_predict"""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    return np.sum(np.absolute(y_true - y_predict)) / len(y_true)


def r2_score(y_true, y_predict):
    """Compute the R Square between y_true and y_predict"""
    return 1 - mean_squared_error(y_true, y_predict) / np.var(y_true)
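As a quick sanity check, the r2_score above can be compared against scikit-learn's sklearn.metrics.r2_score; a minimal sketch with made-up arrays:

import numpy as np
from sklearn.metrics import r2_score as sk_r2_score
from ml09metrics import r2_score

y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_predict = np.array([2.5, 0.0, 2.0, 8.0])
print(r2_score(y_true, y_predict))     # 1 - MSE/Var, about 0.949
print(sk_r2_score(y_true, y_predict))  # should agree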
Multiple Linear Regression with scikit-learn
In [32]:
from sklearn import datasets
In [33]:
boston = datasets.load_boston()
X = boston.data
y = boston.target
In [34]:
X = X[y < 50.0]
y = y[y < 50.0]
In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
In [36]:
from sklearn.linear_model import LinearRegression
In [37]:
lin_reg = LinearRegression()
In [38]:
lin_reg.fit(X_train, y_train)
Out[38]:
In [39]:
lin_reg.coef_
Out[39]:
In [40]:
lin_reg.intercept_
Out[40]:
In [41]:
lin_reg.score(X_test, y_test)
Out[41]:
In [42]:
from sklearn.neighbors import KNeighborsRegressor
knn_reg = KNeighborsRegressor()
In [44]:
knn_reg.fit(X_train, y_train)
Out[44]:
In [45]:
knn_reg.score(X_test, y_test)
Out[45]:
In [46]:
from sklearn.model_selection import GridSearchCV
In [50]:
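# two candidate families: uniform weighting, and distance weighting,
# where the Minkowski exponent p is searched only in the distance-weighted family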
para_grid = [
{"weights": ["uniform"],
"n_neighbors": [i for i in range(1, 11)]
},
{"weights": ["distance"],
"n_neighbors": [i for i in range(1, 11)],
"p": [i for i in range(1, 6)]
}
]
In [51]:
knn_reg = KNeighborsRegressor()
# n_jobs sets how many CPU cores to use; -1 means all available cores
grid_search = GridSearchCV(knn_reg, para_grid, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
Out[51]:
In [52]:
grid_search.best_params_
Out[52]:
In [53]:
grid_search.best_score_  # this score is computed with cross-validation, so it is not directly comparable to the test-set score above
Out[53]:
In [54]:
grid_search.best_estimator_.score(X_test, y_test)
# this uses the same scoring criterion (R^2) as the linear regression above
Out[54]:
In [55]:
import numpy as np
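# sort the feature indices by coefficient, from most negative to most positive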
np.argsort(lin_reg.coef_)
Out[55]:
In [58]:
boston.feature_names
Out[58]:
In [59]:
boston.feature_names[np.argsort(lin_reg.coef_)]
Out[59]:
In [60]:
print(boston.DESCR)