• Kaggle house price prediction


    Importing the data

    import time
    import numpy as np
    import pandas as pd
    import torch
    from torch import nn
    from torch.nn import functional as F
    from torch.utils import data
    from d2l import torch as d2l

    train_content = pd.read_csv("./house_prices/data/train.csv")
    test_content = pd.read_csv("./house_prices/data/test.csv")
    train_content.head()
    

    [screenshot: output of train_content.head()]

    Feature engineering

    Merge the train data and test data so the features can be processed in one go.

    all_features = pd.concat((train_content.iloc[:, 1:-1], test_content.iloc[:, 1:]))
    numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index  # index of the non-object (numeric) columns
    all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / x.std())  # standardize
    all_features[numeric_features] = all_features[numeric_features].fillna(0)  # fill NaN entries with 0
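
    As a quick optional sanity check: after standardizing and then filling the NaNs with 0, each numeric column should still have mean ≈ 0 (adding zeros to a zero-mean column leaves the mean at 0) and a std close to 1 (slightly below 1 for columns that had NaNs):

    print(all_features[numeric_features].mean().abs().max())  # ~0
    print(all_features[numeric_features].std().mean())        # ~1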
    

    Now for the non-numeric columns; handle them with one-hot encoding.

    all_features = pd.get_dummies(all_features, dummy_na=True)
    all_features.shape
    

    (2919, 331)
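
    To see what dummy_na=True is doing, here is a toy example on a single categorical column (MSZoning is a real column in this dataset; the three-row frame is made up):

    toy = pd.DataFrame({'MSZoning': ['RL', 'RM', None]})
    print(pd.get_dummies(toy, dummy_na=True))
    # one indicator column per level, plus MSZoning_nan for missing values:
    # row 0 -> MSZoning_RL, row 1 -> MSZoning_RM, row 2 -> MSZoning_nan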

    Next, build train_features and the labels.

    n_train = train_content.shape[0]  # number of training examples
    train_features = torch.tensor(all_features.iloc[0:n_train, :].values, dtype=d2l.float32)
    train_labels = torch.tensor(train_content.iloc[:, -1].values.reshape(-1, 1), dtype=d2l.float32)  # note the reshape to a column vector
    test_features = torch.tensor(all_features.iloc[n_train:, :].values, dtype=d2l.float32)
    train_labels
    

    Here I made a serious mistake that kept my training error from coming down: a normal training error is about 0.15, but mine was stuck at 0.4. After a long time debugging, I finally found that the problem was in train_labels.

    The code I first used to get the labels:

    train_labels = torch.tensor(train_content.iloc[:, -1].values, dtype=d2l.float32)
    train_labels
    

    tensor([208500., 181500., 223500., ..., 266500., 142125., 147500.])

    The code with the reshape:

    train_labels = torch.tensor(train_content.iloc[:, -1].values.reshape(-1, 1), dtype=d2l.float32)
    train_labels
    

    [screenshot: train_labels as a column tensor after the reshape]

    Whether the labels come from the first version or the second, torch happily computes with them, and neither raises an error... which is the truly painful part. The reason: given (N,) labels against (N, 1) predictions, MSELoss broadcasts the pair to (N, N) and averages all pairwise errors instead of the element-wise ones.
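
    A minimal sketch of the trap (values made up):

    y_hat = torch.tensor([[1.0], [2.0], [3.0]])  # predictions, shape (3, 1)
    y_flat = torch.tensor([1.0, 2.0, 3.0])       # labels without the reshape, shape (3,)
    y_col = y_flat.reshape(-1, 1)                # labels as a column, shape (3, 1)
    loss = nn.MSELoss()
    print(loss(y_hat, y_col))   # tensor(0.) -- the intended element-wise MSE
    print(loss(y_hat, y_flat))  # tensor(1.3333) -- (3, 1) vs (3,) broadcasts to (3, 3)!
    # Recent PyTorch versions at least emit a UserWarning about the size mismatch here.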

    Building the model

    I originally set out to build a three-layer MLP, but it performed worse than a single fully connected layer. Speechless. So much for "deep" learning...

    class LinearLay(nn.Module):
        """A hand-rolled fully connected layer with Xavier-initialized weights."""
        def __init__(self, indim, outdim):
            super(LinearLay, self).__init__()
            self.weight = nn.Parameter(torch.FloatTensor(indim, outdim))
            self.bias = nn.Parameter(torch.FloatTensor(outdim))
            nn.init.xavier_uniform_(self.weight)
            nn.init.zeros_(self.bias)

        def forward(self, x):
            return torch.matmul(x, self.weight) + self.bias

    class MLP(nn.Module):
        """Three-layer MLP with tanh activations and dropout (hidden sizes hardcoded)."""
        def __init__(self, indim, outdim, dropout):
            super(MLP, self).__init__()
            self.Lay1 = LinearLay(indim, 128)
            self.Lay2 = LinearLay(128, 16)
            self.Lay3 = LinearLay(16, outdim)
            self.dropout = dropout

        def forward(self, x):
            x = torch.tanh(self.Lay1(x))  # F.tanh is deprecated; torch.tanh does the same
            x = F.dropout(x, self.dropout, training=self.training)
            x = torch.tanh(self.Lay2(x))
            x = F.dropout(x, self.dropout, training=self.training)
            return self.Lay3(x)
    

    Instead, just a single fully connected layer:

    net = nn.Linear(train_features.shape[1], 1)
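
    This is plain linear regression on the 331 processed features, i.e. 331 weights plus 1 bias:

    sum(p.numel() for p in net.parameters())  # 332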
    

    Define the log rmse. This evaluation metric plays the role that accuracy plays in classification: it is what we report, not the MSELoss we train on. (The competition is scored on the RMSE between the logs of the predicted and actual prices, so relative errors on cheap and expensive houses count equally.)

    def log_rmse(net, loss, features, labels):
        clipped_preds = torch.clamp(net(features), 1, float('inf'))  # clamp everything below 1 up to 1 -- handy function
        rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
        return rmse.item()
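
    The clamp matters because a stray non-positive prediction would make torch.log blow up. A tiny illustration (values made up):

    x = torch.tensor([-5.0, 0.5, 3.0])
    print(torch.clamp(x, 1, float('inf')))  # tensor([1., 1., 3.])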
    

    The training function

    def train(net, train_features, train_labels, test_features, test_labels,
              num_epochs, lr, wd, batch_size, cuda=True):
        if cuda:  # train on the GPU
            net.cuda()
            train_features = train_features.cuda()
            train_labels = train_labels.cuda()
            if test_features is not None:
                test_features = test_features.cuda()
                test_labels = test_labels.cuda()

        net.train()
        train_ls, test_ls = [], []
        dataset = data.TensorDataset(train_features, train_labels)
        data_iter = data.DataLoader(dataset, batch_size, shuffle=True)
        # use the Adam optimizer
        optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=wd)
        loss = nn.MSELoss()
        for epoch in range(num_epochs):
            for x, y in data_iter:
                optimizer.zero_grad()
                loss_train = loss(net(x), y)
                loss_train.backward()
                optimizer.step()
            train_ls.append(log_rmse(net, loss, train_features, train_labels))
            if test_features is not None:
                test_ls.append(log_rmse(net, loss, test_features, test_labels))
        return train_ls, test_ls
    

    One thing to note here: moving the net to the GPU is just net.cuda(), because nn.Module.cuda() moves the parameters in place, while moving a tensor requires reassignment, train_features = train_features.cuda(), because Tensor.cuda() returns a new tensor and leaves the original on the CPU.
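
    A minimal sketch of the difference, assuming a CUDA device is available:

    net_demo = nn.Linear(4, 1)
    net_demo.cuda()                            # in place: the parameters move to the GPU
    print(next(net_demo.parameters()).device)  # cuda:0

    vec = torch.zeros(4)
    vec.cuda()         # returns a NEW tensor; vec itself is unchanged
    print(vec.device)  # cpu
    vec = vec.cuda()   # rebind to keep the GPU copy
    print(vec.device)  # cuda:0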

    K-fold cross-validation

    def get_K_fold_data(k, i, X, y):
        assert k > 1
        fold_size = X.shape[0] // k  # any leftover rows (X.shape[0] % k) are dropped
        X_train, y_train = None, None
        for j in range(k):
            idx = slice(j * fold_size, (j + 1) * fold_size)
            X_part, y_part = X[idx, :], y[idx]
            if j == i:
                X_val, y_val = X_part, y_part  # fold i is the validation fold
            elif X_train is None:
                X_train, y_train = X_part, y_part
            else:
                X_train = torch.cat([X_train, X_part], dim=0)
                y_train = torch.cat([y_train, y_part], dim=0)
        return X_train, y_train, X_val, y_val

    def K_fold(indim, k, features, labels, num_epochs, lr, wd, batch_size):
        train_ls_sum, test_ls_sum = 0.0, 0.0
        cuda = torch.cuda.is_available()
        for i in range(k):
            net = nn.Linear(indim, 1)  # a fresh net for every fold
            fold_data = get_K_fold_data(k, i, features, labels)  # renamed so it doesn't shadow torch.utils.data
            train_ls, test_ls = train(net, *fold_data, num_epochs, lr, wd, batch_size, cuda)
            train_ls_sum += train_ls[-1]
            test_ls_sum += test_ls[-1]
            if i == 0:
                d2l.plot(list(range(1, num_epochs + 1)), [train_ls, test_ls], xlabel='epochs',
                         ylabel='rmse', xlim=[1, num_epochs],
                         legend=['train', 'valid'], yscale='log')

            print('fold {}, train log rmse {} valid log rmse {}'.format(i + 1, train_ls[-1], test_ls[-1]))

        return train_ls_sum / k, test_ls_sum / k
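
    A quick sanity check of the splitter (this dataset has 1460 training rows and 331 features, so with k=5 each validation fold holds 292 of them):

    Xt, yt, Xv, yv = get_K_fold_data(5, 0, train_features, train_labels)
    print(Xt.shape, Xv.shape)  # torch.Size([1168, 331]) torch.Size([292, 331])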
    

    Training

    indim = train_features.shape[1]
    lr = 10
    batch_size = 64
    wd = 5e-5
    num_epochs = 200
    t = time.time()
    k = 5
    train_ls, val_ls = K_fold(indim, k, train_features, train_labels, num_epochs, lr, wd, batch_size)
    print('{}-fold cross-validation: average train log rmse {}, average valid log rmse {}'.format(k, train_ls, val_ls))
    print("elapsed: {}s".format(time.time() - t))
    

    [screenshot: train/valid log rmse curves for the first fold]

    Looks like the real thing, doesn't it?

    Train and predict

    net = nn.Linear(indim, 1)
    train_ls, _ = train(net, train_features, train_labels, None, None, 200, 10, 5e-5, 64)  # train directly on all the training data
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'train log rmse {float(train_ls[-1]):f}')
    # apply the network to the test set
    test_features = test_features.cuda()  # the net lives on the GPU
    net.eval()
    preds = net(test_features).to('cpu').detach().numpy()  # detach first: net(x) still carries grad information, so it must be detached before converting to an ndarray
    test_content['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])  # the original used an undefined test_data; the frame is called test_content
    submission = pd.concat([test_content['Id'], test_content['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)
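
    The resulting submission.csv has one predicted price per test Id (the prices below are illustrative):

    Id,SalePrice
    1461,119433.08
    1462,158367.38
    ...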
    

    [screenshot: output of the final training run]

  • Original post: https://www.cnblogs.com/kalicener/p/15524877.html