• 【udacity】机器学习-波士顿房价预测

    import numpy as np
    import pandas as pd
    from Udacity.model_check.boston_house_price import visuals as vs # Supplementary code
    from sklearn.model_selection import ShuffleSplit
    # Notebook-only display setup is not needed in this script.
    # Load the Boston housing dataset. 'MEDV' is the target column (prices);
    # every remaining column is kept as a feature.
    data = pd.read_csv('housing.csv')
    prices = data['MEDV']
    features = data.drop('MEDV', axis=1)
    # print(data.describe())
    # Success: report the shape (rows, columns) of the loaded table.
    print("Boston housing dataset has {} data points with {} variables each.".format(*data.shape))
    # Goal: minimum of the prices
    minimum_price = np.min(prices)
    # Goal: maximum of the prices
    maximum_price = np.max(prices)
    # Goal: mean of the prices
    mean_price = np.mean(prices)
    # Goal: median of the prices
    median_price = np.median(prices)
    # Goal: standard deviation of the prices
    std_price = np.std(prices)
    # Goal: print the computed statistics.
    # The header literal below was previously unterminated, which made the
    # whole file fail to parse; the closing quote and parenthesis are restored.
    print("Statistics for Boston housing dataset:\n")
    print("Minimum price: ${:,.2f}".format(minimum_price))
    print("Maximum price: ${:,.2f}".format(maximum_price))
    print("Mean price: ${:,.2f}".format(mean_price))
    print("Median price ${:,.2f}".format(median_price))
    print("Standard deviation of prices: ${:,.2f}".format(std_price))
    # 'r2_score' is imported inside performance_metric below.
    def performance_metric(y_true, y_predict):
        """Score predictions against the true values using R^2.

        Args:
            y_true: Observed target values.
            y_predict: Predicted target values, aligned with ``y_true``.

        Returns:
            The result of ``sklearn.metrics.r2_score(y_true, y_predict)``.
        """
        # Imported at call time, matching the local-import style of this script.
        from sklearn import metrics

        return metrics.r2_score(y_true, y_predict)
    # score = performance_metric([3, -0.5, 2, 7, 4.2], [2.5, 0.0, 2.1, 7.8, 5.3])
    # print ("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))
    from sklearn.model_selection import train_test_split
    # Hold out 20% of the rows for testing and train on the remaining 80%.
    # The earlier test_size=0.80 inverted these proportions, leaving only a
    # fifth of the data for training. random_state pins the shuffle so the
    # split is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, prices, test_size=0.20, random_state=1)
    # Success
    print("Training and testing split was successful.")
    # vs.ModelLearning(features, prices)
    def fit_model(X, y):
        """Grid-search 'max_depth' for a decision tree regressor on [X, y].

        Candidate depths 1 through 10 are each evaluated with 10-fold
        cross-validation, scored by ``performance_metric`` wrapped with
        ``make_scorer``.

        Args:
            X: Training features.
            y: Training targets, aligned with ``X``.

        Returns:
            The fitted ``best_estimator_`` of the grid search.
        """
        from sklearn.metrics import make_scorer
        from sklearn.model_selection import GridSearchCV, KFold
        from sklearn.tree import DecisionTreeRegressor

        # Cross-validation splitter applied to the training data.
        cross_validator = KFold(n_splits=10)
        # cv_sets = ShuffleSplit(X.shape[0],  test_size=0.20, random_state=0)

        regressor = DecisionTreeRegressor()

        # Candidate tree depths: 1..10 inclusive.
        params = {"max_depth": list(range(1, 11))}

        # Turn 'performance_metric' into a scorer usable by the grid search.
        scoring_fnc = make_scorer(performance_metric)

        # 'scoring' must be passed by keyword: GridSearchCV accepts only the
        # estimator and the parameter grid positionally in current
        # scikit-learn, so the old positional form raises TypeError.
        grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cross_validator)

        # Fit the grid search object to the data to compute the optimal model.
        grid = grid.fit(X, y)

        # Return the optimal model after fitting the data.
        return grid.best_estimator_
    # Fit on the training split; fit_model hands back the best estimator found.
    reg = fit_model(X_train, y_train)
    # Report the 'max_depth' chosen for the optimal model.
    best_depth = reg.get_params()['max_depth']
    print("Parameter 'max_depth' is {} for the optimal model.".format(best_depth))
    # One row of feature values per client.
    # NOTE(review): column order presumably matches the training features — confirm.
    client_data = [
        [5, 17, 15],  # Client 1
        [4, 32, 22],  # Client 2
        [8, 3, 12],   # Client 3
    ]
    # Show predictions, numbering clients from 1.
    predicted_prices = reg.predict(client_data)
    for client_no, price in enumerate(predicted_prices, start=1):
        print("Predicted selling price for Client {}'s home: ${:,.2f}".format(client_no, price))
