• 机器学习入门之房价预测(线性回归)


    #!/usr/bin/env python
    # coding: utf-8
    
    # In[1]:
    
    
    # 1. Define the problem
    
    # 2. Load the data
    
    # Import libraries
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split, KFold, cross_val_score
    from sklearn.preprocessing import StandardScaler
    # NOTE(review): get_ipython is neither imported nor defined in this file;
    # presumably it is supplied by the IPython/Jupyter runtime (the "# In[n]:"
    # cell markers suggest a notebook export) -- confirm before running this
    # as a plain Python script.
    get_ipython().run_line_magic('matplotlib', 'inline')
    
    import warnings
    # Suppress every warning for the rest of the session.
    warnings.filterwarnings('ignore')
    
    # Never truncate the column list when a DataFrame is displayed
    pd.set_option('display.max_columns', None)

    # Read the training and test sets from the data directory
    train_data, test_data = (
        pd.read_csv(csv_path)
        for csv_path in ('../data/train.csv', '../data/test.csv')
    )
    
    
    # In[2]:
    
    
    # 3. Understand the data
    # Summary of the training frame (printed by DataFrame.info)
    train_data.info()


    # In[3]:


    # Dimensions of the training frame
    train_data.shape


    # In[4]:


    # First five rows (five is the default for head)
    train_data.head()


    # In[5]:


    # Descriptive statistics, transposed so each statistic becomes a column
    train_data.describe().transpose()
    
    
    # In[6]:
    
    
    # 4. Data visualisation

    # Analyse SalePrice
    # Print the summary statistics explicitly: a bare expression that is not
    # the last statement of a notebook cell (and any bare expression in a
    # script) is evaluated and then silently discarded.
    print(train_data['SalePrice'].describe())
    # Plot the distribution of the sale prices.
    # NOTE(review): seaborn has deprecated distplot in favour of
    # histplot/displot -- migrate once the minimum seaborn version is known.
    sns.distplot(train_data['SalePrice'])
    plt.show()
    
    
    # In[7]:
    
    
    # Correlation matrix
    # Correlate only numeric/boolean columns. Older pandas dropped other
    # columns silently, but pandas >= 2.0 raises if the frame still contains
    # non-numeric columns, so select them explicitly.
    corr = train_data.select_dtypes(include=['number', 'bool']).corr()
    f, ax = plt.subplots(figsize=(20, 9))
    # Fix the colour scale to the full [-1, 1] correlation range and draw on
    # the axes created above rather than relying on the implicit current axes.
    sns.heatmap(corr, vmax=1, vmin=-1, square=True, ax=ax)
    
    
    # In[8]:
    
    
    # Feature selection
    # Show the rows of the correlation matrix whose correlation with SalePrice
    # exceeds 0.5 in absolute value (ten features according to the original
    # author's note).
    # The absolute value must be taken of the correlations, not of the 0.5
    # threshold, otherwise strongly negative correlations are never selected.
    # The matrix is computed once, and only over numeric/boolean columns so the
    # call also works on pandas >= 2.0.
    price_corr = train_data.select_dtypes(include=['number', 'bool']).corr()
    price_corr[price_corr['SalePrice'].abs() > 0.5]
    
    
    # In[9]:
    
    
    # Ten predictor columns followed by the target column; the slicing in the
    # later cells depends on this order.
    cols = [
        'OverallQual',
        'YearBuilt',
        'YearRemodAdd',
        'TotalBsmtSF',
        '1stFlrSF',
        'GrLivArea',
        'FullBath',
        'TotRmsAbvGrd',
        'GarageCars',
        'GarageArea',
        'SalePrice',
    ]
    # Keep only the selected columns and print a summary of the reduced frame
    train_data = train_data.loc[:, cols]
    train_data.info()
    
    
    # In[10]:
    
    
    # 5. Build the model

    # Split the columns: the first ten are the features, the eleventh is the
    # target
    data_matrix = train_data.values
    X, Y = data_matrix[:, :10], data_matrix[:, 10]
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, random_state=42, test_size=0.33)
    # Fit a linear regression on the training split
    model = LinearRegression().fit(X_train, Y_train)
    # Predict the held-out split and report the mean absolute error
    y_pred = model.predict(X_test)
    abs_errors = np.abs(y_pred - Y_test)
    print('cost:' + str(np.sum(abs_errors / len(y_pred))))
    
    
    # In[11]:
    
    
    # The cost on the raw data is large, so standardise the features and the
    # target and fit a second model on the scaled values.
    # NOTE(review): both scalers are fitted on the full data set before the
    # split, so statistics of the held-out rows leak into training; fit them
    # on the training split only if an unbiased estimate is needed.
    x_scaler = StandardScaler()
    y_scaler = StandardScaler()
    X_scaled = x_scaler.fit_transform(X)
    Y_scaled = y_scaler.fit_transform(Y.reshape(-1, 1))
    X_scaled_train, X_scaled_test, Y_scaled_train, Y_scaled_test = train_test_split(
        X_scaled, Y_scaled, test_size=0.33, random_state=42)
    model_scaled = LinearRegression()
    model_scaled.fit(X_scaled_train, Y_scaled_train)
    # Predict with the model that was trained on the scaled data. The previous
    # code called the raw-data `model` here, which (a) is fed inputs on the
    # wrong scale and (b) returns shape (n,), which broadcasts against the
    # (n, 1) target into an (n, n) matrix -- that is why the reported error
    # grew after scaling.
    y_pred = model_scaled.predict(X_scaled_test)
    # Mean absolute error in standardised target units; multiply by
    # y_scaler.scale_[0] to compare it with the raw-data cost.
    print('cost:' + str(np.sum(abs(y_pred - Y_scaled_test) / len(y_pred))))
    
    
    # In[12]:
    
    
    # Add an empty target column so the test frame can be subset with the same
    # column list (and order) as the training frame.
    test_data['SalePrice'] = None
    # Take an explicit copy so the assignments below modify an independent
    # frame rather than a possible view of the original.
    test_data = test_data[cols].copy()
    # Fill missing values with each column's median.
    # Assign the result back instead of calling fillna(inplace=True) on a
    # selected column: that is chained assignment, which pandas warns about
    # and which does not update the frame under Copy-on-Write.
    for col in ('TotalBsmtSF', 'GarageCars', 'GarageArea'):
        test_data[col] = test_data[col].fillna(test_data[col].median())
    
    
    # In[13]:
    
    
    # Feature matrix for the test set: the first ten columns, i.e. everything
    # except the trailing SalePrice column
    X = test_data.values[:, :10]
    # Predict with the model fitted on the unscaled training features
    y_test_pre = model.predict(X)
    # Store the predictions in the target column and preview the first ten rows
    test_data['SalePrice'] = y_test_pre
    test_data.head(n=10)
  • 相关阅读:
    python3----练习题(斐波那契)
    python3----运算符
    python3----函数、匿名函数
    python3----生成器generator(yield)
    Python捕获异常
    OS模块
    发送邮件
    IO文件读写
    Unittest框架概念
    生成报告
  • 原文地址:https://www.cnblogs.com/coco-shi/p/9820658.html
Copyright © 2020-2023  润新知