听说 GitHub 上有人号召在 100 天内通过项目入门机器学习,觉得这是一个特别好的想法。我也一直筹谋着下一份工作一定要与机器学习相关,从 Web 到 AI 的转型可能是我至今为止面临的最大难题。但是我不怕,俗话说得好:竹杖芒鞋轻胜马,谁怕?一蓑烟雨任平生。
第一天: 数据预处理
# 1. importing the required libraries.
import numpy as np
import pandas as pd
# 2. Load the dataset from CSV.
#    X holds every column except the last one; Y holds the last column.
dataset = pd.read_csv("../data/Data.csv")
X, Y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values
# Show both arrays, one after the other.
print(X, Y, sep="\n")
# 3. Handling the missing data
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22.
# `sklearn.impute.SimpleImputer` is its replacement; it always works
# column-wise, which matches the old `axis = 0` setting, and takes the
# real NaN value (np.nan) instead of the string "NaN".
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# Learn the per-column means of columns 1 and 2, then overwrite those
# columns in place with the imputed values.
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print(X)
# 4. Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Replace the values in column 0 with integer codes.
labelecoder_X = LabelEncoder()
X[:, 0] = labelecoder_X.fit_transform(X[:, 0])
print(X)
# 4.1 Creating dummy variables
# The `categorical_features` argument of OneHotEncoder was removed in
# scikit-learn 0.22. ColumnTransformer now selects column 0 for one-hot
# encoding and passes the remaining columns through unchanged. The encoded
# columns come first, followed by the passed-through ones, which is the same
# layout the old argument produced.
# sparse_threshold=0 forces a dense result (the old code called .toarray()).
onehotencoder = ColumnTransformer(
    [("onehot", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
# X is an object array at this point, so the stacked result is object-typed;
# cast to float to match the numeric array the old encoder returned.
X = np.asarray(onehotencoder.fit_transform(X), dtype=float)
print(X)
# Encode the values in Y as integer codes.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
print(Y)
# 5. Splitting the dataset into test set and training set.
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
print(X_train)
print(X_test)
print(Y_train)
print(Y_test)
# 6. Feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# Fit the scaler on the training data only.
X_train = sc_X.fit_transform(X_train)
# Reuse the training mean/std for the test data. Calling fit_transform here
# would refit the scaler on the test set, putting train and test on
# different scales and leaking test-set statistics into preprocessing.
X_test = sc_X.transform(X_test)
print(X_train)
print(X_test)
Data.csv
Country,Age,Salary,Purchased
France,44,72000,No
Spain,27,48000,Yes
Germany,30,54000,No
Spain,38,61000,No
Germany,40,,Yes
France,35,58000,Yes
Spain,,52000,No
France,48,79000,Yes
Germany,50,83000,No
France,37,67000,Yes