import numpy as np
import pandas as pd
# Data preprocessing with sklearn.preprocessing
# Load the dataset: each row is a sample, each column is a feature
dataset = pd.read_csv('data.csv')
X = dataset.iloc[:, :-1].values  # independent variables (features)
Y = dataset.iloc[:, 3].values    # dependent variable (target)
# X is the feature matrix, Y is the target vector
# print(X)
# Handle missing data: NaN entries in the numeric columns are replaced with the column mean
# print("数据导入")
# print("X")
# print(X)
# print("Y")
# print(Y)
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer
# (from sklearn.impute) is the current equivalent and imputes column-wise.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
# print("X")
# print(X)
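# Optional sanity check (not part of the original script): after imputation the
# numeric columns should contain no NaN values.
# print(np.isnan(X[:, 1:3].astype(float)).any())  # expected output: False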
# Encode categorical data: LabelEncoder maps the text labels in column 0 to integers
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# print(X)
# One-hot encode column 0 so no ordering is implied between its categories.
# OneHotEncoder's categorical_features argument was removed in scikit-learn 0.22;
# ColumnTransformer now selects which columns to encode, and sparse_threshold=0
# forces a dense array (the equivalent of the old .toarray() call).
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([("onehot", OneHotEncoder(), [0])], remainder="passthrough", sparse_threshold=0)
X = ct.fit_transform(X)
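# Optional check: the encoded matrix has one column per distinct category in the
# original column 0, followed by the passthrough numeric columns.
# print(X.shape)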
# LabelEncoder is also used for the target vector (integer class labels)
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
# print(Y)
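# The original string labels remain recoverable through the fitted encoder
# (labelencoder_Y.classes_ or labelencoder_Y.inverse_transform) if needed later.
# print(labelencoder_Y.classes_)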
# Split the dataset into training and test sets (80% / 20%)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
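# Optional check: with test_size=0.2, roughly 20% of the rows end up in the test set.
# print(X_train.shape, X_test.shape)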
# Feature scaling (standardization): rescale each feature to zero mean and unit
# variance, i.e. roughly a standard normal (Gaussian) distribution. The scaler is
# fitted on the training data only, and the same transform is applied to the test data.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
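# Optional sanity check: after scaling, each column of X_train should have mean ~0
# and standard deviation ~1 (up to floating-point error); X_test will be close but
# not exact, because the scaler was fitted on the training set only.
# print(X_train.mean(axis=0), X_train.std(axis=0))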