    1 实验环境



    什么是 Colaboratory?

    借助 Colaboratory(简称 Colab),您可在浏览器中编写和执行 Python 代码,并且:

    • 无需任何配置
    • 免费使用 GPU
    • 轻松共享

    无论您是一名学生、数据科学家还是 AI 研究员,Colab 都能够帮助您更轻松地完成工作。您可以观看 Colab 简介了解详情,或查看入门指南!

    对于 Colab 笔记本,您可以将可执行代码、富文本以及图像、HTML、LaTeX 等内容合入 1 个文档中。当您创建自己的 Colab 笔记本时,系统会将这些笔记本存储在您的 Google 云端硬盘帐号名下。您可以轻松地将 Colab 笔记本共享给同事或好友,允许他们评论甚至修改笔记本。要了解详情,请参阅 Colab 概览。要创建新的 Colab 笔记本,您可以使用上方的“文件”菜单,也可以使用以下链接:创建新的 Colab 笔记本

    Colab 笔记本是由 Colab 托管的 Jupyter 笔记本。如需详细了解 Jupyter 项目,请访问 jupyter.org

    使用过程中,请记得在「菜单栏 > 代码执行程序 > 更改运行时类型」中启用 GPU 加速。

    2 实验

    2.1 环境配置和导入

    # Notebook shell commands: install torch and torchtext with pip, then
    # download the spaCy English model (spaCy is the tokenizer used below).
    !pip install torch
    !pip install torchtext
    !python -m spacy download en
    # K80 gpu for 12 hours
    import torch
    from torch import nn, optim
    from torchtext import data, datasets
    # Report whether PyTorch can see a CUDA device in this runtime.
    print('GPU:', torch.cuda.is_available())
    2.2 设置数据集

    # Field definitions: review text is tokenized with spaCy, labels use dtype torch.float.
    TEXT = data.Field(tokenize='spacy')
    LABEL = data.LabelField(dtype=torch.float)
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    # IMDB is a dataset provided by torchtext
    print('len of train data:', len(train_data))
    print('len of test data:', len(test_data))
    len of train data: 25000
    len of test data: 25000
    ['First', 'I', 'was', 'caught', 'totally', 'off', 'guard', 'by', 'the', 'film', "'s", 'initial', 'lyricism', 'and', 'then', 'I', 'became', 'totally', 'enchanted', 'with', 'the', 'unfolding', 'story', 'and', 'engrossed', 'with', 'the', 'brilliant', 'directing', '.', 'The', 'characters', 'were', 'all', 'fully', 'developed', ',', 'not', 'bigger', '-', 'than', '-', 'life', 'but', 'just', 'like', 'the', 'people', 'we', 'live', 'among', 'anywhere', 'we', 'are', 'in', 'the', 'world', ',', 'in', 'Sweden', ',', 'in', 'Turkey', 'or', 'in', 'America', ',', 'all', 'completely', 'believable', 'human', 'beings', 'with', 'foibles', 'and', 'nobility', '.', 'Hollywood', 'could', 'learn', 'so', 'much', 'from', 'this', 'beautiful', 'film', '.', 'It', 'shows', 'that', 'there', 'is', 'no', 'need', 'to', 'go', 'into', 'every', 'little', 'detail', 'behind', 'every', 'action', 'to', 'bring', 'out', 'the', 'whole', 'theme', 'clear', 'and', 'bright', ',', 'and', 'that', 'shows', 'the', 'brilliance', 'of', 'the', 'director', '!', 'Hearfelt', 'thanks', 'to', 'Kay', 'Pollak', 'and', 'the', 'wonderful', 'cast', 'for', 'this', 'superb', 'treat', '!', '!']
    # word2vec, glove
    # Build the vocabulary from the training split only (max_size=10000) and
    # attach the pretrained GloVe 6B 100-dimensional vectors to it.
    TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
    batchsz = 30
    # Prefer the GPU, but fall back to the CPU so the notebook still runs when
    # no CUDA runtime is attached.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # The iterators place every batch on `device`, so the model and the loss
    # must live on the same device.
    train_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, test_data),
        batch_size = batchsz,
        device = device
    )
    2.3 搭建lstm网络

    class RNN(nn.Module):
        """Two-layer bidirectional LSTM that emits one logit per input sequence.

        Token ids are embedded, passed through the LSTM, and the final hidden
        states of the last layer (forward and backward directions) are
        concatenated and projected to a single value.
        """

        def __init__(self, vocab_size, embedding_dim, hidden_dim):
            """Create the embedding, LSTM, output projection and dropout layers.

            Args:
                vocab_size: number of rows in the embedding table.
                embedding_dim: length of each token vector.
                hidden_dim: LSTM hidden size per direction.
            """
            super(RNN, self).__init__()
            # [0-10001] => [100]. In this notebook vocab_size=10002 and
            # embedding_dim=100: 10000 real words plus one unknown-word token
            # and one special token, each represented by a length-100 vector.
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            # [100] => [256]
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,
                               bidirectional=True, dropout=0.5)
            # Introduction to bidirectional RNNs:
            # https://shenxiaohai.me/2018/10/19/pytorch-tutorial-intermediate-04/
            # [256*2] => [1]
            self.fc = nn.Linear(hidden_dim*2, 1)
            self.dropout = nn.Dropout(0.5)

        def forward(self, x):
            """Map a batch of token-id sequences to one logit per sequence.

            Args:
                x: token ids shaped [seq_len, b] (sequence-first, unlike an
                    image batch such as [b, 3, 28, 28]).

            Returns:
                Tensor shaped [b, 1] holding the raw (pre-sigmoid) outputs.
            """
            # [seq, b, 1] => [seq, b, 100]
            embedding = self.dropout(self.embedding(x))
            # output: [seq, b, hid_dim*2]
            # hidden/h: [num_layers*2, b, hid_dim]
            # cell/c: [num_layers*2, b, hid_dim]
            output, (hidden, cell) = self.rnn(embedding)
            # The last two entries of `hidden` belong to the top layer's two
            # directions.
            # [num_layers*2, b, hid_dim] => 2 of [b, hid_dim] => [b, hid_dim*2]
            hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
            # [b, hid_dim*2] => [b, 1]
            hidden = self.dropout(hidden)
            out = self.fc(hidden)
            return out

    2.4 embedding和网络优化

    rnn = RNN(len(TEXT.vocab), 100, 256)
    # Initialise the embedding layer from the pretrained vectors attached to
    # the vocabulary.
    pretrained_embedding = TEXT.vocab.vectors
    print('pretrained_embedding:', pretrained_embedding.shape)
    # Copy the pretrained vectors into the embedding weights; without this the
    # layer keeps its random initialisation.
    rnn.embedding.weight.data.copy_(pretrained_embedding)
    print('embedding layer inited.')
    optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
    criteon = nn.BCEWithLogitsLoss().to(device)
    # Move the model to the same device as the loss; as the last expression of
    # the cell this also displays the module summary.
    rnn.to(device)
    pretrained_embedding: torch.Size([10002, 100])
    embedding layer inited.
    RNN(
      (embedding): Embedding(10002, 100)
      (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
      (fc): Linear(in_features=512, out_features=1, bias=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )

    2.5 训练与测试

    import numpy as np
    def binary_acc(preds, y):
        """Return the fraction of binary predictions that match the labels.

        Args:
            preds: raw logits; they are passed through a sigmoid and rounded
                to 0 or 1 before comparison.
            y: target labels, compared element-wise with the rounded values.

        Returns:
            Scalar tensor: number of matches divided by ``len(correct)``.
        """
        preds = torch.round(torch.sigmoid(preds))
        correct = torch.eq(preds, y).float()
        acc = correct.sum() / len(correct)
        return acc
    def train(rnn, iterator, optimizer, criteon):
        """Run one training pass over ``iterator`` and print the accuracy.

        Args:
            rnn: model called as ``rnn(batch.text)``.
            iterator: iterable of batches exposing ``.text`` and ``.label``.
            optimizer: optimizer that updates the parameters of ``rnn``.
            criteon: loss function called as ``criteon(pred, batch.label)``.

        Prints the batch accuracy every 100 batches and the mean accuracy over
        all batches at the end. Returns nothing.
        """
        avg_acc = []
        # Make sure dropout is active while training.
        rnn.train()
        for i, batch in enumerate(iterator): # iterate over all training data
            # [seq, b] => [b, 1] => [b]
            pred = rnn(batch.text).squeeze(1)
            loss = criteon(pred, batch.label)
            acc = binary_acc(pred, batch.label).item()
            avg_acc.append(acc)
            # Standard update step: clear stale gradients, backpropagate the
            # loss, then apply the parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if i%100 == 0:
                print(i, acc)
        avg_acc = np.array(avg_acc).mean()
        print('avg acc:', avg_acc)
    def eval(rnn, iterator, criteon):
        """Measure the mean batch accuracy of ``rnn`` over ``iterator``.

        Args:
            rnn: model called as ``rnn(batch.text)``.
            iterator: iterable of batches exposing ``.text`` and ``.label``.
            criteon: loss function called as ``criteon(pred, batch.label)``.

        Runs without gradient tracking and with the model in evaluation mode;
        the model's previous train/eval mode is restored afterwards. Prints
        the mean accuracy and returns nothing.
        """
        avg_acc = []
        # Remember the current mode so the caller's state is left untouched.
        was_training = rnn.training
        rnn.eval()
        try:
            with torch.no_grad():
                for batch in iterator:
                    # [b, 1] => [b]
                    pred = rnn(batch.text).squeeze(1)
                    # Loss is evaluated as in train() but is not reported.
                    loss = criteon(pred, batch.label)
                    acc = binary_acc(pred, batch.label).item()
                    avg_acc.append(acc)
        finally:
            rnn.train(was_training)
        avg_acc = np.array(avg_acc).mean()
        print('>>test:', avg_acc)
    # Ten epochs; each one reports accuracy on the test iterator first and
    # then runs a pass over the training iterator.
    for epoch in range(10):
        eval(rnn, iterator=test_iterator, criteon=criteon)
        train(rnn, iterator=train_iterator, optimizer=optimizer, criteon=criteon)
