• A Regression Problem Example


    The data can be downloaded here (extraction code: 1fl1), and a description of the data is available in this file (extraction code: 7tvm). The goal is to analyze which factors a building's ENERGY STAR Score is related to and to predict that score.

    A complete machine learning project consists mainly of the following steps:

    • Exploratory Data Analysis (EDA)
    • Feature Engineering and Selection
    • Model Comparison
    • Hyperparameter Tuning
    • Model Evaluation and Interpretation

    1. Exploratory Data Analysis

    • Read data and confirm data types
      import pandas as pd
      import numpy as np
      import seaborn as sns
      import matplotlib.pyplot as plt
      from IPython.core.pylabtools import figsize
      from sklearn.model_selection import train_test_split
      from sklearn.preprocessing import Imputer, FunctionTransformer, MinMaxScaler, LabelBinarizer  # Imputer requires scikit-learn < 0.22; newer versions provide sklearn.impute.SimpleImputer instead
      from sklearn_pandas import DataFrameMapper, CategoricalImputer
      from sklearn.pipeline import Pipeline
      data = pd.read_csv('Energy_and_Water_Data_Disclosure_for_Local_Law_84_2017__Data_for_Calendar_Year_2016.csv')
      data = data.replace({'Not Available': np.nan})
      numeric_units = ['ft²','kBtu','Metric Tons CO2e','kWh','therms','gal','Score']
      for col in list(data.columns):
          for unit in numeric_units:
              # Select columns that should be numeric
              if unit in col:
                  # Convert the data type to float
                  data[col] = data[col].astype(float)
    • Check missing values
      def missing_values_table(df):
          # Total missing values
          mis_val = df.isnull().sum()    
          # Percentage of missing values
          mis_val_percent = 100 * df.isnull().sum() / len(df)        
          # Make a table with the results
          mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)        
          # Rename the columns
          mis_val_table = mis_val_table.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})
          # Sort the table by percentage of missing descending
          mis_val_table = mis_val_table[mis_val_table.iloc[:,1] != 0].sort_values('% of Total Values', ascending=False).round(1)    
          # Return the dataframe with missing information
          return mis_val_table   
      missing_df = missing_values_table(data)
      ### drop the columns that have >50% missing values
      missing_columns = list(missing_df[missing_df['% of Total Values'] > 50].index)
      print('We will remove %d columns.' % len(missing_columns))
      data = data.drop(columns = missing_columns)
    • Remove Outliers
      # Calculate first and third quartile
      first_quartile = data['Site EUI (kBtu/ft²)'].describe()['25%']
      third_quartile = data['Site EUI (kBtu/ft²)'].describe()['75%']
      iqr = third_quartile - first_quartile  #Interquartile range
      data = data[(data['Site EUI (kBtu/ft²)'] > (first_quartile - 3 * iqr)) & 
                  (data['Site EUI (kBtu/ft²)'] < (third_quartile + 3 * iqr))]
    • Histogram of the target
      ### Histogram of the Energy Star Score (the target)
      data = data.rename(columns = {'ENERGY STAR Score': 'score'})
      plt.style.use('fivethirtyeight')
      plt.hist(data['score'].dropna(), bins = 100, edgecolor = 'k')
      plt.xlabel('Score'); plt.ylabel('Number of Buildings')
      plt.title('Energy Star Score Distribution') 

    • Correlations between the target and numerical variables
      # Find all correlations and sort
      correlations_data = data.corr()['score'].sort_values()
      # Print the most negative correlations
      print(correlations_data.head(15), '\n')
      # Print the most positive correlations
      print(correlations_data.tail(15))
    • Plot distributions of the target for a categorical variable
      # Create a list of building types with more than 100 observations
      types = data.dropna(subset=['score'])
      types = types['Largest Property Use Type'].value_counts()
      types = list(types[types.values > 100].index)
      # Plot each building
      sns.set(font_scale = 1)
      figsize(6, 5)
      for b_type in types:
          # Select the building type
          subset = data[data['Largest Property Use Type'] == b_type]  
          # Density plot of Energy Star scores
          sns.kdeplot(subset['score'].dropna(), label = b_type, shade = False, alpha = 0.8)
      # label the plot
      plt.xlabel('Energy Star Score', size = 10)
      plt.ylabel('Density', size = 10)
      plt.title('Density Plot of Energy Star Scores by Building Type', size = 14)

    • Visualization of the target vs a numerical variable and a categorical variable
      temp = data.dropna(subset=['score'])
      # Limit to building types with more than 100 observations
      temp = temp[temp['Largest Property Use Type'].isin(types)]
      # Visualization
      figsize(9, 7.5)
      sns.set(font_scale = 2)
      sns.lmplot('Site EUI (kBtu/ft²)', 'score', hue = 'Largest Property Use Type', data = temp, 
                 scatter_kws = {'alpha': 0.8, 's': 60}, fit_reg = False, size = 12, aspect = 1.2)
      # Plot labeling
      plt.xlabel("Site EUI", size = 28)
      plt.ylabel('Energy Star Score', size = 28)
      plt.title('Energy Star Score vs Site EUI', size = 36)

    • Pair Plot
      # Extract the columns to plot
      plot_data = data[['score', 'Site EUI (kBtu/ft²)', 'Weather Normalized Source EUI (kBtu/ft²)']]
      # Replace the inf with nan
      plot_data = plot_data.replace({np.inf: np.nan, -np.inf: np.nan})
      # Rename columns
      plot_data = plot_data.rename(columns = {'Site EUI (kBtu/ft²)': 'Site EUI', 
                                              'Weather Normalized Source EUI (kBtu/ft²)': 'Weather Norm EUI'})
      # Drop na values
      plot_data = plot_data.dropna()
      # Function to calculate correlation coefficient between two columns
      def corr_func(x, y, **kwargs):
          r = np.corrcoef(x, y)[0][1]
          ax = plt.gca()
          ax.annotate("r = {:.2f}".format(r), xy=(.2, .8), xycoords=ax.transAxes, size = 20)
      # Create the pairgrid object
      figsize(9,7.5)
      sns.set(font_scale = 1)
      grid = sns.PairGrid(data = plot_data, height = 3)
      # Upper is a scatter plot
      grid.map_upper(plt.scatter, color = 'red', alpha = 0.6)
      # Diagonal is a histogram
      grid.map_diag(plt.hist, color = 'red', edgecolor = 'black')
      # Bottom is correlation and density plot
      grid.map_lower(corr_func);
      grid.map_lower(sns.kdeplot, cmap = plt.cm.Reds)
      # Title for entire plot
      plt.suptitle('Pairs Plot of Energy Data', size = 24, y = 1.02)

    2. Feature Engineering and Selection

    • Feature Engineering
      ### Extract the buildings with no score and the buildings with a score
      numeric_cols = data.columns[data.dtypes!=object].tolist()
      categorical_cols = ['Borough', 'Largest Property Use Type']
      no_score = data[numeric_cols+categorical_cols][data['score'].isna()] #for prediction
      score = data[numeric_cols+categorical_cols][data['score'].notnull()]
      ### Separate out the features and targets
      features = score.drop(columns='score')
      targets = pd.DataFrame(score['score'])
      numeric_cols.remove('score')
      ### Split into 70% training and 30% testing set
      X, X_test, y, y_test = train_test_split(features, targets, test_size = 0.3, random_state = 42)
      ### Create columns with a signed log transform of each numeric column:
      ### sign(x)*log(|x|+1) stays defined for zeros and negative values, unlike a plain log (a toy illustration follows this block)
      def add_log(df):
          temp = df.copy()
          for col in numeric_cols:
              temp['log_' + col] = np.sign(df[col])*np.log(np.abs(df[col])+1)
          return temp
      ### Apply numeric imputer and min-max transform
      cols = numeric_cols+['log_'+col for col in numeric_cols]
      numeric_imputer = [([feature], [Imputer(strategy="median"),MinMaxScaler(feature_range=(0, 1))]) 
                         for feature in cols]
      ### Apply categorical imputer and one-hot encode
      category_imputer = [(feature, [CategoricalImputer(strategy='constant', fill_value='Missing'),LabelBinarizer()]) 
                          for feature in categorical_cols]
      ### union mapper
      mapper = DataFrameMapper(numeric_imputer+category_imputer, input_df=True, df_out=True)
      ### feature engineer pipeline
      fea_engine = Pipeline([("add_log", FunctionTransformer(add_log, validate=False)), 
                             ("num_cat_mapper", mapper)])
      X_engine = fea_engine.fit_transform(X)
      X_test_engine = fea_engine.transform(X_test)
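      A toy illustration (hypothetical values, not from the dataset) of the signed log transform used in add_log above:
      x = np.array([-10.0, 0.0, 10.0])
      print(np.sign(x) * np.log(np.abs(x) + 1))  # approximately [-2.398  0.  2.398]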
    • Feature Selection
      ### Remove collinear features whose pairwise correlation coefficient exceeds the threshold.
      ### Removing collinear features can help the model generalize and improves its interpretability (a pandas-only sketch of the idea follows this block).
      ### package feature_selector: https://github.com/WillKoehrsen/feature-selector
      from feature_selector import FeatureSelector
      cols = numeric_cols+['log_'+col for col in numeric_cols]
      fs = FeatureSelector(data = X_engine[cols], labels = y)
      fs.identify_collinear(correlation_threshold=0.8)
      correlated_features = fs.ops['collinear']
      print(fs.record_collinear) # print detailed information about the correlated pairs
      X_select = X_engine.drop(columns = correlated_features)
      X_test_select = X_test_engine.drop(columns = correlated_features)
      ### Write to files for next modeling
      no_score.to_csv('data/no_score.csv', index = False)
      X_select.to_csv('data/training_features.csv', index = False)
      X_test_select.to_csv('data/testing_features.csv', index = False)
      y.to_csv('data/training_labels.csv', index = False)
      y_test.to_csv('data/testing_labels.csv', index = False)
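      For reference, a minimal pandas-only sketch of the same collinearity filter (a hypothetical helper, not part of the original code; feature_selector performs an equivalent scan of the correlation matrix):
      def collinear_to_drop(df, threshold=0.8):
          corr = df.corr().abs()
          # Keep only the upper triangle so each feature pair is examined once
          upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
          # Drop any column correlated above the threshold with an earlier column
          return [col for col in upper.columns if (upper[col] > threshold).any()]
      # e.g. collinear_to_drop(X_engine[cols], threshold=0.8) should flag roughly the same columns as fs.ops['collinear']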

    3. Model Comparison

    from sklearn.linear_model import LinearRegression
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.svm import SVR
    from sklearn.neighbors import KNeighborsRegressor
    ### Read data
    train_features = pd.read_csv('data/training_features.csv')
    test_features = pd.read_csv('data/testing_features.csv')
    train_labels = pd.read_csv('data/training_labels.csv')
    test_labels = pd.read_csv('data/testing_labels.csv')
    ### Function to calculate mean absolute error
    def mae(y_true, y_pred):
        return np.mean(abs(y_true - y_pred))
    ### Create a baseline
    baseline_guess = np.median(train_labels)
    print('The baseline guess is a score of %0.2f' % baseline_guess)
    print("Baseline Performance on the test set: MAE = %0.4f" % mae(test_labels, baseline_guess))
    ### Model Comparison
    def fit_and_evaluate(model):
        model.fit(train_features, train_labels.values.reshape((-1,))) #train the model   
        model_pred = model.predict(test_features) #predict the model
        model_mae = mae(test_labels.values.reshape((-1,)), model_pred) #compute the metric
        return model_mae
    lr = LinearRegression() #Linear Regression
    lr_mae = fit_and_evaluate(lr)
    print('Linear Regression Performance on the test set: MAE = %0.4f' % lr_mae)
    svm = SVR(C = 1000, gamma = 0.1)
    svm_mae = fit_and_evaluate(svm) #SVR
    print('Support Vector Machine Regression Performance on the test set: MAE = %0.4f' % svm_mae)
    random_forest = RandomForestRegressor(random_state=60) #Random Forest
    random_forest_mae = fit_and_evaluate(random_forest)
    print('Random Forest Regression Performance on the test set: MAE = %0.4f' % random_forest_mae)
    gradient_boosted = GradientBoostingRegressor(random_state=60) #GBM
    gradient_boosted_mae = fit_and_evaluate(gradient_boosted)
    print('Gradient Boosted Regression Performance on the test set: MAE = %0.4f' % gradient_boosted_mae)
    knn = KNeighborsRegressor(n_neighbors=10) #KNN
    knn_mae = fit_and_evaluate(knn)
    print('K-Nearest Neighbors Regression Performance on the test set: MAE = %0.4f' % knn_mae)
    ### Visualization
    plt.style.use('fivethirtyeight')
    figsize(8, 6)
    model_comparison = pd.DataFrame({'model': ['Linear Regression', 'SVR', 'RF', 'GBM', 'KNN'], 
                                     'mae': [lr_mae, svm_mae, random_forest_mae, gradient_boosted_mae, knn_mae]}) # Dataframe to hold the results
    model_comparison.sort_values('mae', ascending = False).plot(x = 'model', y = 'mae', 
                                                                kind = 'barh', color = 'red', edgecolor = 'black') # Horizontal bar chart of test mae
    plt.ylabel(''); plt.yticks(size = 14); plt.xlabel('Mean Absolute Error'); plt.xticks(size = 14)
    plt.title('Model Comparison on Test MAE', size = 20)

    4. Hyperparameter Tuning

    from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
    ### Create the model to use for hyperparameter tuning
    model = GradientBoostingRegressor(random_state=60)
    ### Set tuned hyperparameters
    loss = ['ls', 'lad', 'huber']  # 'ls' and 'lad' are named 'squared_error' and 'absolute_error' in scikit-learn >= 1.0
    n_estimators = [100, 500, 900, 1100, 1500]
    max_depth = [2, 3, 5, 10, 15]
    min_samples_leaf = [1, 2, 4, 6, 8]
    min_samples_split = [2, 4, 6, 10]
    max_features = ['sqrt', 'log2', None]
    hyperparameter_grid = {'loss': loss, 'n_estimators': n_estimators, 'max_depth': max_depth, 
                           'min_samples_leaf': min_samples_leaf, 'min_samples_split': min_samples_split, 
                           'max_features': max_features}
    ### Set up the random search with 4-fold cross validation
    random_cv = RandomizedSearchCV(estimator=model, param_distributions=hyperparameter_grid, 
                                   cv=4, n_iter=25, scoring='neg_mean_absolute_error', n_jobs=-1, 
                                   verbose=1, random_state=60)
    random_cv.fit(train_features, train_labels.values.reshape((-1,)))
    print(random_cv.best_estimator_)
    ### Further grid search for the model
    model = random_cv.best_estimator_
    trees_grid = {'n_estimators': [500, 600, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100]}
    grid_search = GridSearchCV(estimator=model, param_grid=trees_grid, cv=4, verbose=1, 
                               scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(train_features, train_labels.values.reshape((-1,)))
    ### Plot the training and testing error vs number of trees
    results = pd.DataFrame(grid_search.cv_results_)
    figsize(8, 8)
    plt.style.use('fivethirtyeight')
    plt.plot(results['param_n_estimators'], -1 * results['mean_test_score'], label = 'Testing Error')
    plt.plot(results['param_n_estimators'], -1 * results['mean_train_score'], label = 'Training Error')
    plt.xlabel('Number of Trees'); plt.ylabel('Mean Absolute Error'); plt.legend()
    plt.title('Performance vs Number of Trees') # the resulting figure
    ### The resulting plot shows signs of overfitting (a gap between training and testing error), so run another round of grid search to try to reduce it
    of_grid = {'n_estimators': [100,200,300,400,500], 'min_samples_leaf': [1,2], 'min_samples_split': [2,4], 'max_depth': [3,5]}
    grid_search_of = GridSearchCV(estimator=grid_search.best_estimator_, param_grid=of_grid, cv=4, verbose=1, 
                                  scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search_of.fit(train_features, train_labels.values.reshape((-1,)))
    ### Final Model
    final_model = grid_search_of.best_estimator_
    print(final_model)
    final_mae = fit_and_evaluate(final_model)
    print('Final model performance on the test set:   MAE = %0.4f.' % final_mae) #MAE: 9.02
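    A small follow-up (not in the original post): the selected hyperparameters and the cross-validated MAE can be read off the fitted search object.
    print(grid_search_of.best_params_)
    print('Cross-validated MAE = %0.4f' % -grid_search_of.best_score_)  # scoring is neg_mean_absolute_error, so negate it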

    5. Model Evaluation and Interpretation

    • Predict the missing scores
      no_score = pd.read_csv('data/no_score.csv').drop(columns='score')
      no_score_engine = fea_engine.transform(no_score)
      no_score_select = no_score_engine.drop(columns = correlated_features)
      final_model = grid_search_of.best_estimator_
      final_model.fit(train_features, train_labels.values.reshape((-1,)))
      score_preds = final_model.predict(no_score_select)
      score_preds = np.where(score_preds>100, 100, score_preds)
      score_preds = np.where(score_preds<0, 0, score_preds)
      ### Plot
      plt.style.use('fivethirtyeight')
      plt.hist(score_preds, bins = 100, edgecolor = 'k')
      plt.xlabel('Predicted Score'); plt.ylabel('Number of Buildings')
      plt.title('Energy Star Score Distribution')

    • Feature Importance
      # Extract the feature importances into a dataframe
      feature_results = pd.DataFrame({'feature': list(train_features.columns), 'importance': final_model.feature_importances_})
      # Show the top 10 most important
      feature_results = feature_results.sort_values('importance', ascending = False).reset_index(drop=True)
      print(feature_results.head(10))
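      An optional follow-up (a sketch, not in the original code above): plot the ten most important features in the same style as the other charts.
      figsize(8, 6)
      feature_results.head(10).plot(x = 'feature', y = 'importance', kind = 'barh', color = 'blue', edgecolor = 'black', legend = False)
      plt.xlabel('Importance'); plt.title('Top 10 Feature Importances', size = 20)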
    • Local Interpretable Model-agnostic Explanations (LIME)
      import lime
      import lime.lime_tabular
      ### Find the residuals
      residuals = abs(final_model.predict(test_features) - test_labels.values.reshape((-1,)))
      ### Extract the worst prediction
      wrong = test_features.values[np.argmax(residuals), :]
      ### Create a lime explainer object
      explainer = lime.lime_tabular.LimeTabularExplainer(training_data = train_features.values, 
                                                         categorical_features=list(range(25,len(train_features.columns))), 
                                                         mode = 'regression', 
                                                         training_labels = train_labels.values.reshape((-1,)), 
                                                         feature_names = list(train_features.columns))
      ### Explanation for wrong prediction
      print('Prediction: %0.4f' % final_model.predict(wrong.reshape(1, -1)))
      print('Actual Value: %0.4f' % test_labels.values.reshape((-1,))[np.argmax(residuals)])
      wrong_exp = explainer.explain_instance(data_row = wrong, predict_fn = final_model.predict)
      ### Plot the prediction explanation
      wrong_exp.as_pyplot_figure()
      plt.title('Explanation of Prediction for the Wrong Case', size = 28)
      plt.xlabel('Effect on Prediction', size = 22)

      For an introduction to LIME, see this article. The code above only analyzes the single example the model predicts worst. As the resulting plot shows, the model's prediction is too low in this case mainly because Site EUI and Weather Normalized Site Electricity Intensity are both high; the higher these two values, the lower a building's ENERGY STAR score tends to be, which is a pattern the model learned during training. In this example, however, the building's actual ENERGY STAR score is high despite these two high values. That contradicts the experience the model distilled from the training data and ultimately produces a large prediction error.
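      As a quick check without plotting, the same explanation can also be read as text; a minimal sketch using LIME's as_list() accessor on the wrong_exp object created above:
      for rule, weight in wrong_exp.as_list():
          print('%-60s %8.4f' % (rule, weight))  # each (feature rule, weight) pair's contribution to the prediction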

  • Original article: https://www.cnblogs.com/sunwq06/p/11152010.html