• 练习1车费预测


    image
    image

    源代码:

    # %%
    '''
    步骤:
    1、读入数据集,将车费、经纬度进行清洗
    (使用plt画散点图(省略))
    2、用sklearn进行预测
    '''
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import sklearn
    
    
    # Load the first one million rows of the training set.
    train = pd.read_csv(r"C:\Users\Administrator\纽约出租车车费预测\train.csv", nrows=1000000)


    train.head()


    # Author's observation from this summary: the minimum fare is negative and
    # the maxima of the coordinates and the passenger count are too large.
    train.describe()


    train.shape  # size of the raw data set


    # Drop every row that contains at least one NaN.
    # The previous form, `train.isna().any(1)`, passed `axis` positionally,
    # which pandas 2.x no longer accepts; `dropna` removes the same rows.
    train.dropna(axis=0, how="any", inplace=True)


    train.shape  # size after the NaN rows were removed
    
    
    # # Clean the passenger count


    train["passenger_count"].describe()


    # Frequency of every passenger count, rarest first, to spot abnormal values.
    train["passenger_count"].value_counts().sort_values(ascending=True)


    # Remove rows with more than 6 passengers or with no passenger at all.
    train.drop(
        index=train.index[train['passenger_count'].gt(6) | train['passenger_count'].eq(0)],
        inplace=True,
    )

    # Same frequency table again, after the abnormal rows were dropped.
    train["passenger_count"].value_counts().sort_values(ascending=True)
    
    
    # # Clean the coordinates
    eps = 1e-7
    # Rows whose pickup and drop-off positions differ by less than eps in both
    # longitude and latitude (author's note: many trips barely move).
    train[
        ((train["pickup_longitude"] - train["dropoff_longitude"]).abs() < eps)
        & ((train["pickup_latitude"] - train["dropoff_latitude"]).abs() < eps)
    ]


    # Compared with the describe() output, rows far away from the column mean
    # have to go. Columns 3..6 are processed one after another, so each mean is
    # computed on the rows that survived the previous column's filter.
    for col_name in train.columns[3:7]:
        col_mean = train[col_name].mean()
        outside = train[col_name].lt(col_mean - 10) | train[col_name].gt(col_mean + 10)
        train.drop(index=train.index[outside], inplace=True)


    train.describe()
    
    # %% [markdown]
    # # Clean the fare


    # Number of rows per fare value, ordered by the fare itself.
    train["fare_amount"].value_counts().sort_index(ascending=True)


    # Remove every row whose fare is below eps.
    train.drop(index=train.index[train["fare_amount"].lt(eps)], inplace=True)


    train["fare_amount"].describe()  # any fare greater than 0 is treated as valid


    # Author's observation: apart from the fare, the remaining columns now show
    # a small variance, so the outliers are considered mostly removed.
    train.describe()
    
    
    
    # # Load the test set and add time-derived columns to both train and test


    test = pd.read_csv(r"C:\Users\Administrator\Desktop\纽约出租车车费预测\test.csv")


    # Convert the timestamp columns of both frames to datetime.
    for frame in (train, test):
        for stamp_col in ('key', 'pickup_datetime'):
            frame[stamp_col] = pd.to_datetime(frame[stamp_col])


    train.dtypes

    # Add calendar columns derived from the pickup time. The mapping is
    # "new column name -> attribute of the .dt accessor"; the insertion order
    # is the same for both frames.
    calendar_parts = {
        'year': 'year',
        'month': 'month',
        'day': 'day',
        'hour': 'hour',
        'day of week': 'dayofweek',
    }
    for frame in (train, test):
        for new_col, dt_attr in calendar_parts.items():
            frame[new_col] = getattr(frame['pickup_datetime'].dt, dt_attr)


    train.dtypes

    test.dtypes
    
    
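    # # Compute the trip distance (in kilometres) and the fare per kilometre (the per-distance fare is not used for prediction, because the goal is to predict the fare of the test set)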
    
    
    
    
    def distance(lat1, long1, lat2, long2, data=None):
        """Add a haversine distance column ``H_Distance`` to each data frame.

        The distance is the great-circle distance in kilometres between the
        point (``lat1``, ``long1``) and the point (``lat2``, ``long2``), with
        the coordinates given in degrees.

        Args:
            lat1: Name of the column holding the start latitude.
            long1: Name of the column holding the start longitude.
            lat2: Name of the column holding the end latitude.
            long2: Name of the column holding the end longitude.
            data: Iterable of data frames to process in place. When omitted,
                the module-level ``train`` and ``test`` frames are used, which
                is what the function did before this parameter existed.

        Returns:
            The distance values computed for the last frame in ``data``, or
            ``None`` when ``data`` is empty.
        """
        if data is None:
            # Backward-compatible default: operate on the script's global frames.
            data = [train, test]

        R = 6371  # Earth radius in kilometres
        d = None
        for frame in data:
            phi1 = np.radians(frame[lat1])
            phi2 = np.radians(frame[lat2])

            delta_phi = np.radians(frame[lat2] - frame[lat1])
            delta_lambda = np.radians(frame[long2] - frame[long1])

            # a = sin^2((phiB - phiA) / 2) + cos(phiA) * cos(phiB) * sin^2((lambdaB - lambdaA) / 2)
            a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2

            # c = 2 * atan2(sqrt(a), sqrt(1 - a)).
            # Rounding can push `a` marginally above 1 for near-antipodal
            # points; flooring 1 - a at 0 keeps the square root from yielding NaN.
            c = 2 * np.arctan2(np.sqrt(a), np.sqrt(np.maximum(1 - a, 0)))

            # d = R * c, in kilometres
            d = R * c
            frame['H_Distance'] = d
        return d
    
    # Adds the H_Distance column (kilometres) to both train and test.
    distance(
        lat1='pickup_latitude',
        long1='pickup_longitude',
        lat2='dropoff_latitude',
        long2='dropoff_longitude',
    )

    # Earlier variant, kept for reference only: remove trips that did not move
    # by comparing the raw coordinates instead of the computed distance.
    # eps = 1e-7
    # train.drop( train[(train["pickup_longitude"] - train["dropoff_longitude"] < eps) & (train["pickup_longitude"] - train["dropoff_longitude"] > -eps) & \
    #       (train["pickup_latitude"] - train["dropoff_latitude"] < eps) & (train["pickup_latitude"] - train["dropoff_latitude"] > -eps)\
    #      ].index,inplace = True, axis = 0)
    eps = 1e-7
    # Remove trips whose computed distance is within eps of zero.
    train.drop(index=train.index[train['H_Distance'].abs() < eps], inplace=True)

    # Fare per unit of distance. H_Distance is in kilometres, so despite the
    # column name this is a fare per kilometre.
    train["fare_pre_mile"] = train["fare_amount"] / train["H_Distance"]


    train


    train["fare_pre_mile"].describe()

    # Author's observation: only a small share of rows lies above the mean,
    # which suggests a few extreme values inflate the mean.
    mean_rate = train["fare_pre_mile"].mean()
    for offset in range(0, 20):
        print(int((train["fare_pre_mile"] > mean_rate + offset).sum()))


    # Trim the absurdly high per-distance fares step by step; the mean is
    # recomputed before every pass, so the passes have to run in this order.
    for margin in (1000, 100, 100, 50):
        too_high = train['fare_pre_mile'] > train["fare_pre_mile"].mean() + margin
        train.drop(index=train.index[too_high], inplace=True)
        train["fare_pre_mile"].describe()

    # Author's observation: the mean is now basically stable and the rate is
    # close to common sense.

    # Number of rows whose per-distance fare exceeds each threshold 0..19.
    for threshold in range(0, 20):
        print(threshold, " : ", int((train["fare_pre_mile"] > threshold).sum()))

    # Cut off the upper tail (above 8) and everything below 1.
    out_of_range = train['fare_pre_mile'].gt(8) | train['fare_pre_mile'].lt(1)
    train.drop(index=train.index[out_of_range], inplace=True)

    train['fare_pre_mile'].describe()
    
    
    # 预测
    
    from sklearn.linear_model import SGDRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler # 标准化
    
    # Feature matrix and target of the training set. `columns=` replaces the
    # positional axis argument (`drop(..., 1)`), which pandas 2.x rejects.
    x_train = train.drop(columns=["key", "pickup_datetime", "fare_amount", "fare_pre_mile"])
    y_train = train["fare_amount"]
    x_test = test.drop(columns=["key", "pickup_datetime"])


    # Standardise the features. The scaler is fitted on the training data only
    # and then applied unchanged to the test data; re-fitting it on the test
    # set (as before) would scale the two sets differently and feed the model
    # inputs that do not match what it was trained on.
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)

    # Standardise the target; the scaler expects a 2-D array, hence the reshape.
    std_y = StandardScaler()
    y_train = std_y.fit_transform(np.array(y_train).reshape(-1, 1))


    x_train.shape

    y_train.shape

    x_test.shape

    # Prediction with stochastic gradient descent

    sgd = SGDRegressor()

    # SGDRegressor.fit expects a 1-D target.
    y_train = y_train.ravel()
    sgd.fit(x_train, y_train)

    y_sgd_predict = sgd.predict(x_test)
    # Map the predictions back to the original fare scale. The scaler needs a
    # 2-D array; current scikit-learn versions raise on 1-D input, so reshape
    # for the call and flatten afterwards.
    y_sgd_predict = std_y.inverse_transform(y_sgd_predict.reshape(-1, 1)).ravel()

    y_sgd_predict

    test["fare_amount"] = y_sgd_predict

    train
    
    
  • 相关阅读:
    JasperReport
    Linux
    Linux
    Linux
    Linux
    Groovy
    Linux
    VS
    Aliyun
    Linux
  • 原文地址:https://www.cnblogs.com/ruanbaiQAQ/p/15686443.html
Copyright © 2020-2023  润新知