The quality of a dataset and the amount of useful information it contains directly determine how well a machine learning algorithm can learn. It is therefore essential to examine and preprocess the data before feeding it to a learning algorithm.
Main topics:
- Removing and imputing missing values from a dataset
- Getting data into the right format
- Selecting features relevant to model construction
Data preprocessing
import pandas as pd
from io import StringIO
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,
'''
df = pd.read_csv(StringIO(csv_data))  #read_csv reads CSV-formatted data into a pandas DataFrame
#StringIO is used here only for demonstration: it lets read_csv parse the string above as if it were a CSV file on disk
print(df)
#count the number of missing values per column with sum()
#isnull() returns a DataFrame of booleans: False where a cell holds a numeric value, True where a value is missing; sum() then yields the number of missing values in each column
print(df.isnull().sum())
print(df.values)  #the values attribute of a DataFrame exposes the underlying NumPy array
print(df.dropna())  #dropna() drops rows that contain missing values
print(df.dropna(axis=1))  #axis=1 drops columns that contain at least one NaN
print(df.dropna(how='all'))  #drop only rows in which all values are NaN
print(df.dropna(thresh=4))  #keep only rows with at least 4 non-NaN values
print(df.dropna(subset=['C']))  #drop rows that have a NaN in column 'C'
#Mean imputation: replace each missing value with the mean of its feature column.
#Note: the Imputer class from sklearn.preprocessing used in older scikit-learn versions has been replaced by SimpleImputer in sklearn.impute
import numpy as np
from sklearn.impute import SimpleImputer
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
print(imputed_data)
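The same mean imputation can also be done directly in pandas; a minimal sketch of the equivalent call (an illustrative alternative, not part of the original example):
df_filled = df.fillna(df.mean())  #fillna with the per-column means fills each NaN with the mean of its own column
print(df_filled)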
Partitioning a dataset into separate training and test sets. The dataset used here contains 178 wine samples, each described by 13 features that characterize its chemical properties.
import pandas as pd
import numpy as np  #needed for np.unique below
df_wine = pd.read_csv('D:/Python/data/wine.data', header=None)  #read the dataset from local disk (adjust the path to your machine)
#df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)  #or read the open-source Wine dataset online
df_wine.columns = ['Class label', 'Alcohol',
                   'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium',
                   'Total phenols', 'Flavanoids',
                   'Nonflavanoid phenols',
                   'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines',
                   'Proline']
print('Class labels', np.unique(df_wine['Class label']))
print(df_wine.head())
The dataset is randomly partitioned into a test set and a training set using the train_test_split function from scikit-learn's model_selection submodule (named cross_validation in older versions):
from sklearn.model_selection import train_test_split
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X_train, X_test, y_train, y_test)
Feature scaling is a crucial step in data preprocessing: bringing all features onto the same scale makes most learning algorithms perform noticeably better.
#min-max scaling (normalization) with scikit-learn
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)  #the test set is scaled with the parameters learned from the training set
print(X_train_norm, X_test_norm)
#standardization with scikit-learn
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
print(X_train_std, X_test_std)
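To make the two transformations concrete, here is a minimal NumPy sketch of the formulas the scalers apply (illustrative only, not the scikit-learn implementation):
import numpy as np
x = np.array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0])
x_norm = (x - x.min()) / (x.max() - x.min())  #min-max scaling: maps each value into [0, 1]
x_std = (x - x.mean()) / x.std()  #standardization: zero mean, unit variance
print(x_norm)  #[0.  0.2 0.4 0.6 0.8 1. ]
print(x_std)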
Reducing overfitting: regularization, or dimensionality reduction via feature selection. scikit-learn supports L1-regularized models: setting the penalty parameter to 'l1' yields sparse weight vectors.
from sklearn.linear_model import LogisticRegression
#note: recent scikit-learn versions require an L1-capable solver such as 'liblinear' or 'saga'
lr = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
lr.fit(X_train_std, y_train)
print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))  #training and test accuracy are both above 98%, so the model shows no sign of overfitting
Training accuracy: 0.9838709677419355
Test accuracy: 0.9814814814814815
print(lr.intercept_)  #lr.intercept_ returns an array of three intercept values, one per class (one-vs-rest)
[-0.38374769 -0.158147 -0.70042382]
print(lr.coef_)  #lr.coef_ contains three weight vectors, one per class
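The sparsity induced by the L1 penalty can be verified by counting the weights that were driven exactly to zero; a small illustrative check (not part of the original text):
print('Zero weights:', np.sum(lr.coef_ == 0))  #number of the 3x13 coefficients that are exactly 0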
Plotting the regularization path: how the weight coefficients of the individual features change as the regularization strength is varied.
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.subplot(111)
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'pink', 'lightgreen', 'lightblue', 'gray', 'indigo', 'orange']
weights, params = [], []
for c in np.arange(-4, 6, dtype=float):
    lr = LogisticRegression(penalty='l1', C=10**c,
                            random_state=0, solver='liblinear')
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])  #collect the weight vector of the second class
    params.append(10**c)
weights = np.array(weights)
for column, color in zip(range(weights.shape[1]), colors):
    plt.plot(params, weights[:, column],
             label=df_wine.columns[column + 1],
             color=color)
plt.axhline(0, color='black', linestyle='--', linewidth=3)
plt.xlim([10**(-5), 10**5])
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.xscale('log')
plt.legend(loc='upper left')
ax.legend(loc='upper center',
          bbox_to_anchor=(1.38, 1.03),
          ncol=1, fancybox=True)
plt.show()
The plot shows that under strong regularization (C < 0.1) the penalty drives all feature weights toward zero; here C is the inverse of the regularization parameter.
Sequential feature selection algorithms
#SBS (Sequential Backward Selection) algorithm
from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.model_selection import train_test_split  #cross_validation in older scikit-learn versions
from sklearn.metrics import accuracy_score

class SBS():
    def __init__(self, estimator, k_features, scoring=accuracy_score, test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        self.scores_ = [score]
        #remove one feature at a time until only k_features remain, keeping the best-scoring subset at each step
        while dim > self.k_features:
            scores = []
            subsets = []
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train, X_test, y_test, p)
                scores.append(score)
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score
Applying SBS to scikit-learn's KNN classifier:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
knn = KNeighborsClassifier(n_neighbors=2)
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)
Plot the classification accuracy of the KNN classifier for each feature-subset size; the accuracy values were computed on the validation dataset.
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.show()
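To see which features make up one of the smaller subsets, the stored column indices can be mapped back to the column names; a sketch, assuming subsets_[8] is the 5-feature subset (SBS starts from all 13 features and removes one per step):
k5 = list(sbs.subsets_[8])  #subsets_[0] holds all 13 features, so index 8 holds 13 - 8 = 5
print(df_wine.columns[1:][k5])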
Feature selection with random forests: train a forest of 10,000 trees on the wine training set and rank the 13 features by their respective importance measures. Note that tree-based models do not require standardized or normalized features.
from sklearn.ensemble import RandomForestClassifier
feat_labels = df_wine.columns[1:]
forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]  #feature indices sorted by descending importance
for f in range(X_train.shape[1]):
    #index the labels with indices[f] as well, so each name matches its importance score
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
 1) Color intensity                0.182483
 2) Proline                        0.158610
 3) Flavanoids                     0.150948
 4) OD280/OD315 of diluted wines   0.131987
 5) Alcohol                        0.106589
 6) Hue                            0.078243
 7) Total phenols                  0.060718
 8) Alcalinity of ash              0.032033
 9) Malic acid                     0.025400
10) Proanthocyanins                0.022351
11) Magnesium                      0.022078
12) Nonflavanoid phenols           0.014645
13) Ash                            0.013916
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]), importances[indices], color='lightblue', align='center')
plt.xticks(range(X_train.shape[1]), feat_labels, rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()
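To keep only the features above a chosen importance threshold, scikit-learn's SelectFromModel can wrap the already-fitted forest; a sketch with an assumed threshold of 0.1:
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)  #prefit=True reuses the forest trained above
X_selected = sfm.transform(X_train)
print(X_selected.shape)  #rows x number of features whose importance exceeds 0.1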
This post is still rough and a work in progress...
Suggestions and feedback are very welcome.
This blog is for learning and sharing!