• PyTorch seq2seq machine translation model + attention


    Notes excerpt

    Corpus link: https://pan.baidu.com/s/1wpP4t_GSyPAD6HTsIoGPZg
    Extraction code: jqq8

    Data format (figure omitted): each line contains an English sentence and its Chinese translation, separated by a tab.

    Imports:

    import os
    import sys
    import math
    from collections import Counter
    import numpy as np
    import random
    
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    
    import nltk
    nltk.download('punkt')
    

    1. Data preprocessing

    1.1 Read in the English and Chinese data

    • English is tokenized with nltk's word_tokenize and lower-cased

    • Chinese is split into individual characters as the basic unit (see the quick example below)
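
    A quick look at the two tokenization schemes (a toy example of my own, not part of the original notes):

    print(nltk.word_tokenize("She put the magazine on the table.".lower()))
    # ['she', 'put', 'the', 'magazine', 'on', 'the', 'table', '.']
    print([c for c in "她把杂志放在桌上。"])
    # ['她', '把', '杂', '志', '放', '在', '桌', '上', '。']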

    def load_data(in_file):
        cn = []
        en = []
        num_examples = 0
        with open(in_file, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip().split('\t')
                
                en.append(['BOS'] + nltk.word_tokenize(line[0].lower()) + ['EOS'])
                cn.append(['BOS'] + [c for c in line[1]] + ['EOS'])
        
        return en, cn
    
    train_file = 'nmt/en-cn/train.txt'
    dev_file = 'nmt/en-cn/dev.txt'
    train_en, train_cn = load_data(train_file)
    dev_en, dev_cn = load_data(dev_file)
    

    Inspect the returned data:

    print(dev_en[:2])
    print(dev_cn[:2])
    

    [['BOS', 'she', 'put', 'the', 'magazine', 'on', 'the', 'table', '.', 'EOS'], ['BOS', 'hey', ',', 'what', 'are', 'you', 'doing', 'here', '?', 'EOS']]
    [['BOS', '她', '把', '雜', '誌', '放', '在', '桌', '上', '。', 'EOS'], ['BOS', '嘿', ',', '你', '在', '這', '做', '什', '麼', '?', 'EOS']]

    1.2 Build the vocabulary

    UNK_IDX = 0
    PAD_IDX = 1
    def build_dict(sentences, max_words = 50000):
        word_count = Counter()
        for sentence in sentences:
            for word in sentence:
                word_count[word] += 1
        
        ls = word_count.most_common(max_words)   # the max_words most frequent words (descending by count)
        total_words = len(ls) + 2
        
        word_dict = {w[0] : index + 2 for index, w in enumerate(ls)}  # {word: index}; w[0] is the word, w[1] its count
        word_dict['UNK'] = UNK_IDX
        word_dict['PAD'] = PAD_IDX
        
        return word_dict, total_words           # total vocabulary size, at most 50002
    
    en_dict, en_total_words = build_dict(train_en)
    cn_dict, cn_total_words = build_dict(train_cn)
    inv_en_dict = {v: k for k, v in en_dict.items()}  # English: {index: word}
    inv_cn_dict = {v: k for k, v in cn_dict.items()}  # Chinese: {index: character}
    
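
    A quick check of the indexing convention (my own example, not from the original notes): index 0 is UNK, index 1 is PAD, and inv_en_dict maps indices back to words.

    print(en_dict['UNK'], en_dict['PAD'])       # 0 1
    print(inv_en_dict[en_dict.get('the', 0)])   # prints the word itself if 'the' is in the vocabulary, otherwise UNK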

    1.3 Convert the words to indices

    sort_by_len=True: the sentences are sorted by length so that each batch contains sentences of similar length (a tiny example of the sorting helper follows the code below).

    def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
        length = len(en_sentences)
        out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences]
        out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences]
    
        # sort sentences by length
        def len_argsort(seq):
            return sorted(range(len(seq)), key=lambda x: len(seq[x]))
    
        # reorder the Chinese and English sentences in the same order
        if sort_by_len:
            sorted_index = len_argsort(out_en_sentences)
            out_en_sentences = [out_en_sentences[i] for i in sorted_index]
            out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
        
        return out_en_sentences, out_cn_sentences
    
    train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
    dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)  # [[2, 168, 201, 4, 3], [], ...., [2, 5, 14, 13, 22, 9, 149, 17, 107, 24, 121, 16, 20, 267, 7, 181, 23, 15, 6, 422, 25, 220, 4, 3]]
    
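
    A tiny illustration of len_argsort (my own example): it returns the indices that would sort the sentences by length, so the English and Chinese lists can be reordered in lockstep.

    toy = [[1, 2, 3], [4], [5, 6]]
    order = sorted(range(len(toy)), key=lambda x: len(toy[x]))
    print(order)                     # [1, 2, 0]
    print([toy[i] for i in order])   # [[4], [5, 6], [1, 2, 3]]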

    Inspect the returned data:

    print(train_cn[2])
    print([inv_cn_dict[i] for i in train_cn[2]])
    print([inv_en_dict[i] for i in train_en[2]])
    

    [2, 982, 2028, 8, 4, 3]
    ['BOS', '祝', '贺', '你', '。', 'EOS']
    ['BOS', 'congratulations', '!', 'EOS']

    1.4 Split the sentences into batches

    # returns a list of minibatches; each minibatch is an array of sentence indices
    def get_minibatches(n, minibatch_size, shuffle=True):  # n is the total number of sentences
        idx_list = np.arange(0, n, minibatch_size)         # [0, minibatch_size, 2*minibatch_size, ...]: start index of each minibatch
        if shuffle:
            np.random.shuffle(idx_list)
        minibatches = []
        for idx in idx_list:
            minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
        return minibatches
    

    See what the function above does:

    get_minibatches(100, 15)
    

    [array([90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
    array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
    array([60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]),
    array([15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
    array([30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]),
    array([75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89]),
    array([45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59])]

    # seqs is one minibatch from minibatches: a nested list of batch_size sentence encodings (batch_size=64 here)
    def prepare_data(seqs):   
        lengths = [len(seq) for seq in seqs]
        n_samples = len(seqs)                  # number of sentences in the batch
        max_len = np.max(lengths)              # length of the longest sentence in the batch
    
        x = np.zeros((n_samples, max_len)).astype('int32')
        x_lengths = np.array(lengths).astype('int32') # the original (unpadded) sentence lengths
    
        for idx, seq in enumerate(seqs):
            x[idx, :lengths[idx]] = seq   # copy the real tokens; positions beyond each sentence's length stay 0 (padding)
        
        return x, x_lengths
    
    def gen_examples(en_sentences, cn_sentences, batch_size):
        minibatches = get_minibatches(len(en_sentences), batch_size)    
        all_ex = []
        for minibatch in minibatches:
            mb_en_sentences = [en_sentences[t] for t in minibatch]  # the encoded English sentences of one batch, e.g. [[2, 982, 8], [14, 5, 6], ...]
            mb_cn_sentences = [cn_sentences[t] for t in minibatch]
            mb_x, mb_x_len = prepare_data(mb_en_sentences)          # padded encodings and original lengths of the English batch
            mb_y, mb_y_len = prepare_data(mb_cn_sentences)
            all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
        # returns n/batch_size tuples of (English encodings, English lengths, Chinese encodings, Chinese lengths)
        return all_ex   
    
    batch_size = 64
    train_data = gen_examples(train_en, train_cn, batch_size)
    dev_data = gen_examples(dev_en, dev_cn, batch_size)
    
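
    A toy run of prepare_data (my own example, not from the original notes) to show the padding:

    xs, x_lens = prepare_data([[2, 5, 3], [2, 7, 8, 9, 3]])
    print(xs)
    # [[2 5 3 0 0]
    #  [2 7 8 9 3]]
    print(x_lens)   # [3 5]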

    2. Encoder-Decoder model (without attention)

    2.1 Define the loss function

    # masked cross entropy loss
    class LanguageModelCriterion(nn.Module):
        def __init__(self):
            super(LanguageModelCriterion, self).__init__()
    
        def forward(self, input, target, mask):
            # input: [64, 12, 3195] target: [64, 12]  mask: [64, 12]
            # input: (batch_size * seq_len) * vocab_size
            input = input.contiguous().view(-1, input.size(2))
            # target: batch_size * seq_len
            target = target.contiguous().view(-1, 1)
            mask = mask.contiguous().view(-1, 1)
            output = -input.gather(1, target) * mask  # gather along dim 1, using target as indices into the vocab dimension
            # This is the cross-entropy loss; F.log_softmax has already been applied to input upstream.
            # output.shape = torch.Size([768, 1])
            # After the gather, positions whose target is 0 are generally non-zero; the mask resets the padded
            # positions to zero, because index 0 is a real vocabulary entry but the trailing 0s in target are
            # padding, not words.
            output = torch.sum(output) / torch.sum(mask)
            # mean loss over the unmasked tokens; the minus sign above makes this a quantity to minimize
            return output
    
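    A quick sanity check of this criterion (my own toy example, not from the original notes): the masked loss should equal a per-token F.nll_loss averaged over the non-padded positions only.

    logp = F.log_softmax(torch.randn(2, 5, 7), dim=-1)    # fake log-probs: [batch=2, seq_len=5, vocab=7]
    tgt = torch.randint(0, 7, (2, 5))                     # fake targets
    msk = torch.tensor([[1, 1, 1, 0, 0],
                        [1, 1, 1, 1, 1]]).float()         # last two positions of the first sentence are padding
    
    crit = LanguageModelCriterion()
    loss_a = crit(logp, tgt, msk)
    
    per_token = F.nll_loss(logp.view(-1, 7), tgt.view(-1), reduction='none')
    loss_b = (per_token * msk.view(-1)).sum() / msk.sum()
    print(torch.allclose(loss_a, loss_b))                 # expected: True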

    2.2 Encoder

    The encoder's job is to pass the input tokens through an embedding layer and a GRU, turning them into hidden states that serve as the context vectors later on.

    On understanding nn.utils.rnn.pack_padded_sequence and nn.utils.rnn.pad_packed_sequence: http://www.mamicode.com/info-detail-2493083.html (a minimal round-trip example follows).
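
    A minimal illustration of the pack/pad round trip (a toy example of my own, not from the original notes):

    seqs = torch.tensor([[1, 2, 3], [4, 5, 0]])           # second sequence is padded with 0
    lens = torch.tensor([3, 2])                           # real lengths, sorted in descending order
    emb = nn.Embedding(10, 4)
    gru = nn.GRU(4, 6, batch_first=True)
    
    packed = nn.utils.rnn.pack_padded_sequence(emb(seqs), lens, batch_first=True)
    packed_out, hid = gru(packed)                         # the GRU only steps over the real tokens
    out, out_lens = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
    print(out.shape, out_lens)                            # torch.Size([2, 3, 6]) tensor([3, 2])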

    class PlainEncoder(nn.Module):
        def __init__(self, vocab_size, hidden_size, dropout=0.2):   # assume embedding_size == hidden_size
            super(PlainEncoder, self).__init__()
            self.embed = nn.Embedding(vocab_size, hidden_size)
            self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True) # batch_first=True: [batch_size, seq_len, hidden_size]
            self.dropout = nn.Dropout(dropout)
            
        # x: the encoded sentences of one batch
        # lengths: the original (unpadded) length of each sentence
        # the last real hidden state is taken as the context vector, which is why lengths is needed
        def forward(self, x, lengths):    
            # (sorted values, indices that sort them)
            sorted_len, sorted_idx = lengths.sort(0, descending=True)  # sort the sequences in the batch by length, descending
            x_sorted = x[sorted_idx.long()]
            embedded = self.dropout(self.embed(x_sorted))
            
            # sentences are padded to a common length (the real sentences are shorter);
            # pack_padded_sequence lets the RNN read only up to each sentence's real last step
            packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(),
                                                                batch_first=True)   
            # out: [batch, seq_len, hidden_size]
            # hidden: [num_layers=1, batch, hidden_size]
            packed_out, hidden = self.rnn(packed_embedded)
            out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  # back to padded length
            
            _, original_idx = sorted_idx.sort(0, descending=False)                  # restore the original order
            
            out = out[original_idx.long()].contiguous()            # [batch_size, seq_len, hidden_size]
            hidden = hidden[:, original_idx.long()].contiguous()   # [num_layers, batch_size, hidden_size]
            
    #         print("out.shape: ", out.shape, 'hidden.shape: ', hidden.shape)
            
            return out, hidden[[-1]]  # hidden[[-1]] keeps the layer dimension; comparable to out[:, -1]
    

    Quick test (can be commented out):

    # check the output shapes
    p = PlainEncoder(en_total_words, 100)
    
    mb_x = torch.from_numpy(train_data[0][0]).long()
    mb_x_len = torch.from_numpy(train_data[0][1]).long()
    print("dataset:", mb_x.shape, mb_x_len.shape)
    
    o, h = p(mb_x, mb_x_len)
    
    print(o.shape, h.shape)
    print(o[:, -1].shape, '\n', o[:, -1] == h)
    

    dataset: torch.Size([64, 14]) torch.Size([64])
    out.shape: torch.Size([64, 14, 100]) hidden.shape: torch.Size([1, 64, 100])
    torch.Size([64, 14, 100]) torch.Size([1, 64, 100])
    torch.Size([64, 100])
    tensor([[[True, True, True, ..., True, True, True],
    [True, True, True, ..., True, True, True],
    [True, True, True, ..., True, True, True],
    ...,
    [True, True, True, ..., True, True, True],
    [True, True, True, ..., True, True, True],
    [True, True, True, ..., True, True, True]]])

    2.3 Decoder

    The decoder decides the next output word based on what has been translated so far and the context vectors.

    class PlainDecoder(nn.Module):
        def __init__(self, vocab_size, hidden_size, dropout=0.2):
            super(PlainDecoder, self).__init__()
            self.embed = nn.Embedding(vocab_size, hidden_size)
            self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)  # [batch_size, seq_len, hidden_size]
            self.fc = nn.Linear(hidden_size, vocab_size)
            self.dropout = nn.Dropout(dropout)
            
        # Much the same forward pass as PlainEncoder, except that the hidden state is not zero but passed in
        # y: the encoded Chinese sentences of one batch
        # hid: hidden state, i.e. the context vectors
        def forward(self, y, y_lengths, hid):
            sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
            y_sorted = y[sorted_idx.long()]
            hid = hid[:, sorted_idx.long()]
            
            # [batch_size, y_lengths, embed_size=hidden_size]
            y_sorted = self.dropout(self.embed(y_sorted))
            
            packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(),
                                                       batch_first=True)
            out, hid = self.rnn(packed_seq, hid)
            unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
            
            _, original_idx = sorted_idx.sort(0, descending=False)  # sort the indices back to the original (ascending) order
            output_seq = unpacked[original_idx.long()].contiguous() # [batch_size, y_length, hidden_size]
            hid = hid[:, original_idx.long()].contiguous()          # [1, batch_size, hidden_size]
            
            output = F.log_softmax(self.fc(output_seq), -1)         # [batch_size, y_lengths, vocab_size]
            
            return output, hid
            
    

    2.4 Build the Seq2Seq model

    Build the Seq2Seq model that chains the encoder and decoder together (this version has no attention yet).

    class PlainSeq2Seq(nn.Module):
        def __init__(self, encoder, decoder):
            super(PlainSeq2Seq, self).__init__()
            self.encoder = encoder 
            self.decoder = decoder
            
        def forward(self, x, x_lengths, y, y_lengths):
            encoder_cut, hid = self.encoder(x, x_lengths)
            output, hid = self.decoder(y, y_lengths, hid)
                
            return output, None
        
        
        def translate(self, x, x_lengths, y, max_length=10):
            encoder_cut, hid = self.encoder(x, x_lengths)
            preds = []
            batch_size = x.shape[0]
            attns = []
            # sample
            for i in range(max_length):
                # output: [batch_size, y_lengths, vocab_size]
                # During training y is a whole sentence and the decoder runs over it in one pass.
                # At test time words are generated one at a time, so y here is a single word
                # (initially BOS) and y_lengths is likewise 1.
                output, hid = self.decoder(y=y, y_lengths=torch.ones(batch_size).long().to(device), 
                                           hid=hid)
                # BOS is the first input word; afterwards y is updated, so the input for the next
                # prediction is the word predicted at the previous step.
                # output.shape = torch.Size([1, 1, 3195])
                # hid.shape = torch.Size([1, 1, 100])
    
                y = output.max(2)[1].view(batch_size, 1) 
                # .max(2) takes the maximum over the third (vocab) dimension and returns (values, indices);
                # [1] picks the indices, i.e. the predicted word ids
                preds.append(y)
                # preds = [tensor([[5]], device='cuda:0'), tensor([[24]], device='cuda:0'), ... tensor([[4]], device='cuda:0')]
            
            # torch.cat(preds, 1) = tensor([[ 5, 24,  6, 22,  7,  4,  3,  4,  3,  4]], device='cuda:0')
            return torch.cat(preds, 1), None
    

    3. Create the model

    Define the model, the loss, and the optimizer.

    # device was not defined earlier in this excerpt; assume GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    dropout = 0.2
    hidden_size = 100
    encoder = PlainEncoder(vocab_size=en_total_words, hidden_size=hidden_size, dropout=dropout)
    decoder = PlainDecoder(vocab_size=cn_total_words, hidden_size=hidden_size, dropout=dropout)
    
    model = PlainSeq2Seq(encoder, decoder)
    model = model.to(device)
    
    loss_fn = LanguageModelCriterion().to(device)
    optimizer = torch.optim.Adam(model.parameters())
    

    4. Train the model

    def train(model, data, num_epochs=20):
        for epoch in range(num_epochs):
            model.train()      # training mode
            total_num_words = total_loss = 0.
            for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
                mb_x = torch.from_numpy(mb_x).to(device).long()
                mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
                
                mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()  # decoder input: everything up to (but not including) EOS
                mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()  # decoder target: everything after BOS
                
                mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
                mb_y_len[mb_y_len <= 0] = 1
                
                mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
                
                # [mb_y_len.max()]->[1, mb_y_len.max()]
                mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
                mb_out_mask = mb_out_mask.float()
                
                # (pred, target, mask)
                # mb_output contains the target word indices
                loss = loss_fn(mb_pred, mb_output, mb_out_mask)
                
                num_words = torch.sum(mb_y_len).item()
                total_loss += loss.item() * num_words
                total_num_words += num_words
                
                # update the model parameters
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
                optimizer.step()
                
                if it % 100 == 0:
                    print("Epoch: ", epoch, 'iteration', it, 'loss:', loss.item())
                
            
            print("Epoch", epoch, "Training loss", total_loss / total_num_words)
            
            if epoch % 5 == 0:
                evaluate(model, dev_data)
        
        torch.save(model.state_dict(), 'translate_model.pt')
    
    
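    As an aside, the sequence mask built inside train() works purely by broadcasting; a small illustration of my own:

    lengths = torch.tensor([3, 1, 2])
    mask = torch.arange(lengths.max().item())[None, :] < lengths[:, None]
    print(mask.int())
    # tensor([[1, 1, 1],
    #         [1, 0, 0],
    #         [1, 1, 0]], dtype=torch.int32)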

    5. Evaluate the model

    def evaluate(model, data):
        model.eval()
        total_num_words = total_loss = 0.
        
        with torch.no_grad():
            
            for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
                mb_x = torch.from_numpy(mb_x).to(device).long()
                mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
                mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
                mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
                mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long()
                mb_y_len[mb_y_len<=0] = 1
    
                mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
    
                mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
                mb_out_mask = mb_out_mask.float()
    
                loss = loss_fn(mb_pred, mb_output, mb_out_mask)
    
                num_words = torch.sum(mb_y_len).item()
                total_loss += loss.item() * num_words
                total_num_words += num_words
                
        print("Evaluation loss", total_loss / total_num_words)
    

    Train for 100 epochs:

    train(model, train_data, num_epochs=100)
    

    Training results (the training loss keeps decreasing):

    Epoch:  0 iteration 0 loss: 3.3647029399871826
    Epoch:  0 iteration 100 loss: 3.009509563446045
    Epoch:  0 iteration 200 loss: 3.782735824584961
    Epoch 0 Training loss 3.1535905243275186
    Evaluation loss 3.3979927680761692
    Epoch:  1 iteration 0 loss: 3.3019187450408936
    Epoch:  1 iteration 100 loss: 2.9146101474761963
    Epoch:  1 iteration 200 loss: 3.7248971462249756
    Epoch 1 Training loss 3.0795154243968996
    Epoch:  2 iteration 0 loss: 3.204010009765625
    Epoch:  2 iteration 100 loss: 2.863368511199951
    Epoch:  2 iteration 200 loss: 3.6527459621429443
    Epoch 2 Training loss 3.0103434118084182
    Epoch:  3 iteration 0 loss: 3.146893262863159
    Epoch:  3 iteration 100 loss: 2.759276866912842
    Epoch:  3 iteration 200 loss: 3.589343309402466
    Epoch 3 Training loss 2.9467000284848877
    Epoch:  4 iteration 0 loss: 3.1050117015838623
    Epoch:  4 iteration 100 loss: 2.708840847015381
    Epoch:  4 iteration 200 loss: 3.5071861743927
    Epoch 4 Training loss 2.8919197189025825
    Epoch:  5 iteration 0 loss: 3.0071966648101807
    Epoch:  5 iteration 100 loss: 2.6622238159179688
    Epoch:  5 iteration 200 loss: 3.464808225631714
    Epoch 5 Training loss 2.832557945455863
    Evaluation loss 3.2545772727449775
    Epoch:  6 iteration 0 loss: 2.967473268508911
    Epoch:  6 iteration 100 loss: 2.586355209350586
    Epoch:  6 iteration 200 loss: 3.467402696609497
    Epoch 6 Training loss 2.7854216275948933
    Epoch:  7 iteration 0 loss: 2.922556161880493
    Epoch:  7 iteration 100 loss: 2.5442593097686768
    Epoch:  7 iteration 200 loss: 3.402819871902466
    Epoch 7 Training loss 2.7393553376979582
    Epoch:  8 iteration 0 loss: 2.8680827617645264
    Epoch:  8 iteration 100 loss: 2.4990341663360596
    Epoch:  8 iteration 200 loss: 3.363720178604126
    Epoch 8 Training loss 2.6976078317344734
    Epoch:  9 iteration 0 loss: 2.7911880016326904
    Epoch:  9 iteration 100 loss: 2.4367892742156982
    Epoch:  9 iteration 200 loss: 3.3128461837768555
    Epoch 9 Training loss 2.655838535325863
    Epoch:  10 iteration 0 loss: 2.760638475418091
    Epoch:  10 iteration 100 loss: 2.388662338256836
    Epoch:  10 iteration 200 loss: 3.299316167831421
    Epoch 10 Training loss 2.6183036396412334
    Evaluation loss 3.179426688570673
    Epoch:  11 iteration 0 loss: 2.7541329860687256
    Epoch:  11 iteration 100 loss: 2.3711133003234863
    Epoch:  11 iteration 200 loss: 3.2783377170562744
    Epoch 11 Training loss 2.5806847991577992
    Epoch:  12 iteration 0 loss: 2.672988176345825
    Epoch:  12 iteration 100 loss: 2.376006841659546
    Epoch:  12 iteration 200 loss: 3.1972506046295166
    Epoch 12 Training loss 2.5446970471612693
    Epoch:  13 iteration 0 loss: 2.6494789123535156
    Epoch:  13 iteration 100 loss: 2.3170242309570312
    Epoch:  13 iteration 200 loss: 3.1941475868225098
    Epoch 13 Training loss 2.5119990739174747
    Epoch:  14 iteration 0 loss: 2.5805208683013916
    Epoch:  14 iteration 100 loss: 2.287121057510376
    Epoch:  14 iteration 200 loss: 3.15193247795105
    Epoch 14 Training loss 2.479404618952507
    Epoch:  15 iteration 0 loss: 2.5561468601226807
    Epoch:  15 iteration 100 loss: 2.263378858566284
    Epoch:  15 iteration 200 loss: 3.183692216873169
    Epoch 15 Training loss 2.4484731512219886
    Evaluation loss 3.1426560713748
    Epoch:  16 iteration 0 loss: 2.553135871887207
    Epoch:  16 iteration 100 loss: 2.2017245292663574
    Epoch:  16 iteration 200 loss: 3.1033968925476074
    Epoch 16 Training loss 2.422065194773223
    Epoch:  17 iteration 0 loss: 2.5503063201904297
    Epoch:  17 iteration 100 loss: 2.1875879764556885
    Epoch:  17 iteration 200 loss: 3.0571794509887695
    Epoch 17 Training loss 2.392596175684612
    Epoch:  18 iteration 0 loss: 2.447784900665283
    Epoch:  18 iteration 100 loss: 2.146362781524658
    Epoch:  18 iteration 200 loss: 3.064692974090576
    Epoch 18 Training loss 2.3654149344515334
    Epoch:  19 iteration 0 loss: 2.4578680992126465
    Epoch:  19 iteration 100 loss: 2.1460280418395996
    Epoch:  19 iteration 200 loss: 3.024839162826538
    Epoch 19 Training loss 2.3424499056425168
    Epoch:  20 iteration 0 loss: 2.4384076595306396
    Epoch:  20 iteration 100 loss: 2.0974316596984863
    Epoch:  20 iteration 200 loss: 2.9965004920959473
    Epoch 20 Training loss 2.3167023499073878
    Evaluation loss 3.1197055689269915
    Epoch:  21 iteration 0 loss: 2.3817431926727295
    Epoch:  21 iteration 100 loss: 2.0880067348480225
    Epoch:  21 iteration 200 loss: 2.9751596450805664
    Epoch 21 Training loss 2.290719437303847
    Epoch:  22 iteration 0 loss: 2.3944735527038574
    Epoch:  22 iteration 100 loss: 2.0802524089813232
    Epoch:  22 iteration 200 loss: 2.9455509185791016
    Epoch 22 Training loss 2.2698037450677613
    Epoch:  23 iteration 0 loss: 2.3046939373016357
    Epoch:  23 iteration 100 loss: 2.068814992904663
    Epoch:  23 iteration 200 loss: 2.9671618938446045
    Epoch 23 Training loss 2.2478544365587227
    Epoch:  24 iteration 0 loss: 2.2910232543945312
    Epoch:  24 iteration 100 loss: 2.0361578464508057
    Epoch:  24 iteration 200 loss: 2.912736177444458
    Epoch 24 Training loss 2.2235630649205875
    Epoch:  25 iteration 0 loss: 2.335442304611206
    Epoch:  25 iteration 100 loss: 2.0128493309020996
    Epoch:  25 iteration 200 loss: 2.902696132659912
    Epoch 25 Training loss 2.2045435398182813
    Evaluation loss 3.1087384036663863
    Epoch:  26 iteration 0 loss: 2.257906913757324
    Epoch:  26 iteration 100 loss: 1.9572561979293823
    Epoch:  26 iteration 200 loss: 2.8583080768585205
    Epoch 26 Training loss 2.1859489336062077
    Epoch:  27 iteration 0 loss: 2.240891933441162
    Epoch:  27 iteration 100 loss: 1.9300264120101929
    Epoch:  27 iteration 200 loss: 2.8508572578430176
    Epoch 27 Training loss 2.1693027983038515
    Epoch:  28 iteration 0 loss: 2.199796199798584
    Epoch:  28 iteration 100 loss: 1.9422686100006104
    Epoch:  28 iteration 200 loss: 2.842454195022583
    Epoch 28 Training loss 2.1484814160984214
    Epoch:  29 iteration 0 loss: 2.1854031085968018
    Epoch:  29 iteration 100 loss: 1.9529454708099365
    Epoch:  29 iteration 200 loss: 2.848923444747925
    Epoch 29 Training loss 2.129414516738762
    Epoch:  30 iteration 0 loss: 2.1895618438720703
    Epoch:  30 iteration 100 loss: 1.871588110923767
    Epoch:  30 iteration 200 loss: 2.791942834854126
    Epoch 30 Training loss 2.113142051178803
    Evaluation loss 3.1089972194763527
    Epoch:  31 iteration 0 loss: 2.183242082595825
    Epoch:  31 iteration 100 loss: 1.8810741901397705
    Epoch:  31 iteration 200 loss: 2.779383897781372
    Epoch 31 Training loss 2.095987657767845
    Epoch:  32 iteration 0 loss: 2.0996744632720947
    Epoch:  32 iteration 100 loss: 1.8364850282669067
    Epoch:  32 iteration 200 loss: 2.7766530513763428
    Epoch 32 Training loss 2.077641033989847
    Epoch:  33 iteration 0 loss: 2.1275956630706787
    Epoch:  33 iteration 100 loss: 1.8858064413070679
    Epoch:  33 iteration 200 loss: 2.7581260204315186
    Epoch 33 Training loss 2.060825001092984
    Epoch:  34 iteration 0 loss: 2.0973703861236572
    Epoch:  34 iteration 100 loss: 1.851388692855835
    Epoch:  34 iteration 200 loss: 2.7524964809417725
    Epoch 34 Training loss 2.0462104783610435
    Epoch:  35 iteration 0 loss: 2.086354970932007
    Epoch:  35 iteration 100 loss: 1.8358268737792969
    Epoch:  35 iteration 200 loss: 2.731438398361206
    Epoch 35 Training loss 2.0299077402768404
    Evaluation loss 3.1139209169721624
    Epoch:  36 iteration 0 loss: 2.0591766834259033
    Epoch:  36 iteration 100 loss: 1.831368088722229
    Epoch:  36 iteration 200 loss: 2.6570539474487305
    Epoch 36 Training loss 2.014671925172371
    Epoch:  37 iteration 0 loss: 2.035496234893799
    Epoch:  37 iteration 100 loss: 1.8156630992889404
    Epoch:  37 iteration 200 loss: 2.700183391571045
    Epoch 37 Training loss 2.00206255805924
    Epoch:  38 iteration 0 loss: 2.036298990249634
    Epoch:  38 iteration 100 loss: 1.7919279336929321
    Epoch:  38 iteration 200 loss: 2.638498306274414
    Epoch 38 Training loss 1.983478224500046
    Epoch:  39 iteration 0 loss: 2.0249581336975098
    Epoch:  39 iteration 100 loss: 1.7389947175979614
    Epoch:  39 iteration 200 loss: 2.7169861793518066
    Epoch 39 Training loss 1.9724427386659686
    Epoch:  40 iteration 0 loss: 2.0175204277038574
    Epoch:  40 iteration 100 loss: 1.7219321727752686
    Epoch:  40 iteration 200 loss: 2.6475744247436523
    Epoch 40 Training loss 1.9562676721658385
    Evaluation loss 3.1181668797161364
    Epoch:  41 iteration 0 loss: 2.006847620010376
    Epoch:  41 iteration 100 loss: 1.7191071510314941
    Epoch:  41 iteration 200 loss: 2.6677799224853516
    Epoch 41 Training loss 1.9437097878349063
    Epoch:  42 iteration 0 loss: 1.9333022832870483
    Epoch:  42 iteration 100 loss: 1.7141562700271606
    Epoch:  42 iteration 200 loss: 2.5984952449798584
    Epoch 42 Training loss 1.9283085355908671
    Epoch:  43 iteration 0 loss: 1.9463298320770264
    Epoch:  43 iteration 100 loss: 1.717552900314331
    Epoch:  43 iteration 200 loss: 2.612987518310547
    Epoch 43 Training loss 1.9148052832706421
    Epoch:  44 iteration 0 loss: 1.9681422710418701
    Epoch:  44 iteration 100 loss: 1.7166101932525635
    Epoch:  44 iteration 200 loss: 2.593944549560547
    Epoch 44 Training loss 1.9044130284488674
    Epoch:  45 iteration 0 loss: 1.9368000030517578
    Epoch:  45 iteration 100 loss: 1.658645749092102
    Epoch:  45 iteration 200 loss: 2.593125581741333
    Epoch 45 Training loss 1.8893168467190844
    Evaluation loss 3.1277268276045214
    Epoch:  46 iteration 0 loss: 1.8545007705688477
    Epoch:  46 iteration 100 loss: 1.6403976678848267
    Epoch:  46 iteration 200 loss: 2.5595622062683105
    Epoch 46 Training loss 1.8757247360021512
    Epoch:  47 iteration 0 loss: 1.883792519569397
    Epoch:  47 iteration 100 loss: 1.6655203104019165
    Epoch:  47 iteration 200 loss: 2.551154851913452
    Epoch 47 Training loss 1.868178638252467
    Epoch:  48 iteration 0 loss: 1.8451733589172363
    Epoch:  48 iteration 100 loss: 1.6777702569961548
    Epoch:  48 iteration 200 loss: 2.501884937286377
    Epoch 48 Training loss 1.8518471154006044
    Epoch:  49 iteration 0 loss: 1.8499925136566162
    Epoch:  49 iteration 100 loss: 1.6486607789993286
    Epoch:  49 iteration 200 loss: 2.524087429046631
    Epoch 49 Training loss 1.8454946782718415
    Epoch:  50 iteration 0 loss: 1.856377363204956
    Epoch:  50 iteration 100 loss: 1.6574885845184326
    Epoch:  50 iteration 200 loss: 2.501849412918091
    Epoch 50 Training loss 1.8342453327073283
    Evaluation loss 3.1381525688403076
    Epoch:  51 iteration 0 loss: 1.8513492345809937
    Epoch:  51 iteration 100 loss: 1.6156225204467773
    Epoch:  51 iteration 200 loss: 2.546480178833008
    Epoch 51 Training loss 1.8206363293651437
    Epoch:  52 iteration 0 loss: 1.826798915863037
    Epoch:  52 iteration 100 loss: 1.5861092805862427
    Epoch:  52 iteration 200 loss: 2.486717462539673
    Epoch 52 Training loss 1.8091440575272268
    Epoch:  53 iteration 0 loss: 1.7943329811096191
    Epoch:  53 iteration 100 loss: 1.599743366241455
    Epoch:  53 iteration 200 loss: 2.4579596519470215
    Epoch 53 Training loss 1.7989700911108664
    Epoch:  54 iteration 0 loss: 1.7656499147415161
    Epoch:  54 iteration 100 loss: 1.5951091051101685
    Epoch:  54 iteration 200 loss: 2.4595048427581787
    Epoch 54 Training loss 1.7877836588768
    Epoch:  55 iteration 0 loss: 1.7756575345993042
    Epoch:  55 iteration 100 loss: 1.5770317316055298
    Epoch:  55 iteration 200 loss: 2.4162347316741943
    Epoch 55 Training loss 1.7794164511320347
    Evaluation loss 3.1487013315196815
    Epoch:  56 iteration 0 loss: 1.754793643951416
    Epoch:  56 iteration 100 loss: 1.546436071395874
    Epoch:  56 iteration 200 loss: 2.4273550510406494
    Epoch 56 Training loss 1.7669288957699174
    Epoch:  57 iteration 0 loss: 1.7600376605987549
    Epoch:  57 iteration 100 loss: 1.4999576807022095
    Epoch:  57 iteration 200 loss: 2.439790725708008
    Epoch 57 Training loss 1.7579986667589775
    Epoch:  58 iteration 0 loss: 1.7710247039794922
    Epoch:  58 iteration 100 loss: 1.5441653728485107
    Epoch:  58 iteration 200 loss: 2.411104202270508
    Epoch 58 Training loss 1.749948290134124
    Epoch:  59 iteration 0 loss: 1.7791287899017334
    Epoch:  59 iteration 100 loss: 1.5441499948501587
    Epoch:  59 iteration 200 loss: 2.4272119998931885
    Epoch 59 Training loss 1.7376091327428274
    Epoch:  60 iteration 0 loss: 1.7641197443008423
    Epoch:  60 iteration 100 loss: 1.505827784538269
    Epoch:  60 iteration 200 loss: 2.4162049293518066
    Epoch 60 Training loss 1.729162069608205
    Evaluation loss 3.1680270844662357
    Epoch:  61 iteration 0 loss: 1.719151258468628
    Epoch:  61 iteration 100 loss: 1.500209927558899
    Epoch:  61 iteration 200 loss: 2.4351766109466553
    Epoch 61 Training loss 1.7190746620618302
    Epoch:  62 iteration 0 loss: 1.7070326805114746
    Epoch:  62 iteration 100 loss: 1.50221848487854
    Epoch:  62 iteration 200 loss: 2.399951457977295
    Epoch 62 Training loss 1.707298602424269
    Epoch:  63 iteration 0 loss: 1.6960980892181396
    Epoch:  63 iteration 100 loss: 1.4736263751983643
    Epoch:  63 iteration 200 loss: 2.3375589847564697
    Epoch 63 Training loss 1.7027722406700785
    Epoch:  64 iteration 0 loss: 1.6605229377746582
    Epoch:  64 iteration 100 loss: 1.496120572090149
    Epoch:  64 iteration 200 loss: 2.377760887145996
    Epoch 64 Training loss 1.6901847218926664
    Epoch:  65 iteration 0 loss: 1.7002284526824951
    Epoch:  65 iteration 100 loss: 1.463133454322815
    Epoch:  65 iteration 200 loss: 2.377936601638794
    Epoch 65 Training loss 1.6831096865487802
    Evaluation loss 3.177895229637778
    Epoch:  66 iteration 0 loss: 1.6268677711486816
    Epoch:  66 iteration 100 loss: 1.5310866832733154
    Epoch:  66 iteration 200 loss: 2.3395535945892334
    Epoch 66 Training loss 1.6750581275368728
    Epoch:  67 iteration 0 loss: 1.683242678642273
    Epoch:  67 iteration 100 loss: 1.4536606073379517
    Epoch:  67 iteration 200 loss: 2.33609938621521
    Epoch 67 Training loss 1.6638375889732597
    Epoch:  68 iteration 0 loss: 1.6539921760559082
    Epoch:  68 iteration 100 loss: 1.4477794170379639
    Epoch:  68 iteration 200 loss: 2.3414015769958496
    Epoch 68 Training loss 1.6606883198725237
    Epoch:  69 iteration 0 loss: 1.6292625665664673
    Epoch:  69 iteration 100 loss: 1.404828667640686
    Epoch:  69 iteration 200 loss: 2.321927547454834
    Epoch 69 Training loss 1.6506938973182488
    Epoch:  70 iteration 0 loss: 1.6185498237609863
    Epoch:  70 iteration 100 loss: 1.4216632843017578
    Epoch:  70 iteration 200 loss: 2.3253204822540283
    Epoch 70 Training loss 1.6387621088477575
    Evaluation loss 3.1902488400655886
    Epoch:  71 iteration 0 loss: 1.6030402183532715
    Epoch:  71 iteration 100 loss: 1.4137858152389526
    Epoch:  71 iteration 200 loss: 2.3256776332855225
    Epoch 71 Training loss 1.6318460844078808
    Epoch:  72 iteration 0 loss: 1.6068423986434937
    Epoch:  72 iteration 100 loss: 1.4504164457321167
    Epoch:  72 iteration 200 loss: 2.3437039852142334
    Epoch 72 Training loss 1.6246998589395558
    Epoch:  73 iteration 0 loss: 1.5764877796173096
    Epoch:  73 iteration 100 loss: 1.3730628490447998
    Epoch:  73 iteration 200 loss: 2.264051675796509
    Epoch 73 Training loss 1.6186856142415567
    Epoch:  74 iteration 0 loss: 1.5833429098129272
    Epoch:  74 iteration 100 loss: 1.381920576095581
    Epoch:  74 iteration 200 loss: 2.2876336574554443
    Epoch 74 Training loss 1.6106610198597258
    Epoch:  75 iteration 0 loss: 1.5880494117736816
    Epoch:  75 iteration 100 loss: 1.4044418334960938
    Epoch:  75 iteration 200 loss: 2.2574541568756104
    Epoch 75 Training loss 1.5998829403443475
    Evaluation loss 3.205575323503987
    Epoch:  76 iteration 0 loss: 1.5913504362106323
    Epoch:  76 iteration 100 loss: 1.3733941316604614
    Epoch:  76 iteration 200 loss: 2.273179292678833
    Epoch 76 Training loss 1.5944278182877876
    Epoch:  77 iteration 0 loss: 1.574967861175537
    Epoch:  77 iteration 100 loss: 1.4105134010314941
    Epoch:  77 iteration 200 loss: 2.260707139968872
    Epoch 77 Training loss 1.5890476528108952
    Epoch:  78 iteration 0 loss: 1.5877436399459839
    Epoch:  78 iteration 100 loss: 1.3723187446594238
    Epoch:  78 iteration 200 loss: 2.266782760620117
    Epoch 78 Training loss 1.580453802036902
    Epoch:  79 iteration 0 loss: 1.540144920349121
    Epoch:  79 iteration 100 loss: 1.370208978652954
    Epoch:  79 iteration 200 loss: 2.2479166984558105
    Epoch 79 Training loss 1.5723614631359557
    Epoch:  80 iteration 0 loss: 1.5240201950073242
    Epoch:  80 iteration 100 loss: 1.3667224645614624
    Epoch:  80 iteration 200 loss: 2.2798657417297363
    Epoch 80 Training loss 1.5671947631266923
    Evaluation loss 3.2182803124543784
    Epoch:  81 iteration 0 loss: 1.5349093675613403
    Epoch:  81 iteration 100 loss: 1.341757893562317
    Epoch:  81 iteration 200 loss: 2.2628333568573
    Epoch 81 Training loss 1.5582374857442876
    Epoch:  82 iteration 0 loss: 1.4877135753631592
    Epoch:  82 iteration 100 loss: 1.3469762802124023
    Epoch:  82 iteration 200 loss: 2.2514214515686035
    Epoch 82 Training loss 1.5549645483978292
    Epoch:  83 iteration 0 loss: 1.5119167566299438
    Epoch:  83 iteration 100 loss: 1.3386821746826172
    Epoch:  83 iteration 200 loss: 2.2184598445892334
    Epoch 83 Training loss 1.546844436348798
    Epoch:  84 iteration 0 loss: 1.4820687770843506
    Epoch:  84 iteration 100 loss: 1.3448508977890015
    Epoch:  84 iteration 200 loss: 2.199396848678589
    Epoch 84 Training loss 1.5380232074195026
    Epoch:  85 iteration 0 loss: 1.4752027988433838
    Epoch:  85 iteration 100 loss: 1.316656231880188
    Epoch:  85 iteration 200 loss: 2.228752374649048
    Epoch 85 Training loss 1.52975351648403
    Evaluation loss 3.2336413650535087
    Epoch:  86 iteration 0 loss: 1.499496340751648
    Epoch:  86 iteration 100 loss: 1.3332045078277588
    Epoch:  86 iteration 200 loss: 2.2489013671875
    Epoch 86 Training loss 1.5249615564712846
    Epoch:  87 iteration 0 loss: 1.50925874710083
    Epoch:  87 iteration 100 loss: 1.3083447217941284
    Epoch:  87 iteration 200 loss: 2.235308885574341
    Epoch 87 Training loss 1.5197892824018502
    Epoch:  88 iteration 0 loss: 1.4814422130584717
    Epoch:  88 iteration 100 loss: 1.3245668411254883
    Epoch:  88 iteration 200 loss: 2.193997859954834
    Epoch 88 Training loss 1.5135974575387956
    Epoch:  89 iteration 0 loss: 1.4810220003128052
    Epoch:  89 iteration 100 loss: 1.2921677827835083
    Epoch:  89 iteration 200 loss: 2.1645917892456055
    Epoch 89 Training loss 1.5075664417517958
    Epoch:  90 iteration 0 loss: 1.4697095155715942
    Epoch:  90 iteration 100 loss: 1.2751893997192383
    Epoch:  90 iteration 200 loss: 2.188906669616699
    Epoch 90 Training loss 1.5008888401218585
    Evaluation loss 3.2456318169295293
    Epoch:  91 iteration 0 loss: 1.4636540412902832
    Epoch:  91 iteration 100 loss: 1.3394463062286377
    Epoch:  91 iteration 200 loss: 2.192689895629883
    Epoch 91 Training loss 1.4943399774943313
    Epoch:  92 iteration 0 loss: 1.4552161693572998
    Epoch:  92 iteration 100 loss: 1.2322344779968262
    Epoch:  92 iteration 200 loss: 2.1635537147521973
    Epoch 92 Training loss 1.488440135669707
    Epoch:  93 iteration 0 loss: 1.4642064571380615
    Epoch:  93 iteration 100 loss: 1.2490650415420532
    Epoch:  93 iteration 200 loss: 2.137782573699951
    Epoch 93 Training loss 1.4828345331954083
    Epoch:  94 iteration 0 loss: 1.425548791885376
    Epoch:  94 iteration 100 loss: 1.2757179737091064
    Epoch:  94 iteration 200 loss: 2.1594502925872803
    Epoch 94 Training loss 1.47362902414513
    Epoch:  95 iteration 0 loss: 1.4208916425704956
    Epoch:  95 iteration 100 loss: 1.260089635848999
    Epoch:  95 iteration 200 loss: 2.1245341300964355
    Epoch 95 Training loss 1.468862286276855
    Evaluation loss 3.265405671529478
    Epoch:  96 iteration 0 loss: 1.413726568222046
    Epoch:  96 iteration 100 loss: 1.2730776071548462
    Epoch:  96 iteration 200 loss: 2.1034820079803467
    Epoch 96 Training loss 1.464572765902645
    Epoch:  97 iteration 0 loss: 1.3888133764266968
    Epoch:  97 iteration 100 loss: 1.29197096824646
    Epoch:  97 iteration 200 loss: 2.159865617752075
    Epoch 97 Training loss 1.4591572745032382
    Epoch:  98 iteration 0 loss: 1.3947553634643555
    Epoch:  98 iteration 100 loss: 1.271963119506836
    Epoch:  98 iteration 200 loss: 2.1502716541290283
    Epoch 98 Training loss 1.4532260618277022
    Epoch:  99 iteration 0 loss: 1.4218417406082153
    Epoch:  99 iteration 100 loss: 1.2315309047698975
    Epoch:  99 iteration 200 loss: 2.12766695022583
    Epoch 99 Training loss 1.4487215552807855
    

    6. Translation

    def translate_dev(i):
        en_sent = " ".join([inv_en_dict[w] for w in dev_en[i]])  # the original English
        print(en_sent)
        cn_sent = " ".join([inv_cn_dict[w] for w in dev_cn[i]])  # the reference Chinese
        print("".join(cn_sent))
    
        # a single sentence
        mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
        mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)
        bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)  # shape: [1, 1], i.e. [[2]]
        
        # the decoder starts from bos = [[2]], one sentence of length 1
        translation, attn = model.translate(mb_x, mb_x_len, bos)  # [1, 10]
        # map the predicted indices back to Chinese characters
        translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
        trans = []
        for word in translation:
            if word != "EOS":
                trans.append(word)
            else:
                break
        print("".join(trans))           # the model's Chinese translation
    
    # load the trained model
    model.load_state_dict(torch.load('translate_model.pt', map_location=device))
    for i in range(100, 120):
        translate_dev(i)
        print()
    

    Results (with few samples and a short training run, the quality is not great):

    BOS you have nice skin . EOS
    BOS 你 的 皮 膚 真 好 。 EOS
    你只有一些蛋糕。
    
    BOS you 're UNK correct . EOS
    BOS 你 部 分 正 确 。 EOS
    你可以选择。
    
    BOS everyone admired his courage . EOS
    BOS 每 個 人 都 佩 服 他 的 勇 氣 。 EOS
    每個人都抨擊他的健康
    
    BOS what time is it ? EOS
    BOS 几 点 了 ? EOS
    那是什么?
    
    BOS i 'm free tonight . EOS
    BOS 我 今 晚 有 空 。 EOS
    我今晚有空。
    
    BOS here is your book . EOS
    BOS 這 是 你 的 書 。 EOS
    那是你的书。
    
    BOS they are at lunch . EOS
    BOS 他 们 在 吃 午 饭 。 EOS
    他們正在吃午飯。
    
    BOS this chair is UNK . EOS
    BOS 這 把 椅 子 很 UNK 。 EOS
    这本书非常兴奋。
    
    BOS it 's pretty heavy . EOS
    BOS 它 真 重 。 EOS
    它是最好的。
    
    BOS many attended his funeral . EOS
    BOS 很 多 人 都 参 加 了 他 的 葬 礼 。 EOS
    在这个男人正在看他。
    
    BOS training will be provided . EOS
    BOS 会 有 训 练 。 EOS
    努力停為下雪停。
    
    BOS someone is watching you . EOS
    BOS 有 人 在 看 著 你 。 EOS
    有很多就了。
    
    BOS i slapped his face . EOS
    BOS 我 摑 了 他 的 臉 。 EOS
    我认为我的狗。
    
    BOS i like UNK music . EOS
    BOS 我 喜 歡 流 行 音 樂 。 EOS
    我喜欢音乐。
    
    BOS tom had no children . EOS
    BOS T o m 沒 有 孩 子 。 EOS
    她的父親沒有聽盲。
    
    BOS please lock the door . EOS
    BOS 請 把 門 鎖 上 。 EOS
    請關上門。
    
    BOS tom has calmed down . EOS
    BOS 汤 姆 冷 静 下 来 了 。 EOS
    汤姆坐在机器。
    
    BOS please speak more loudly . EOS
    BOS 請 說 大 聲 一 點 兒 。 EOS
    請說話再說話。
    
    BOS keep next sunday free . EOS
    BOS 把 下 周 日 空 出 来 。 EOS
    星星期天下雨。
    
    BOS i made a mistake . EOS
    BOS 我 犯 了 一 個 錯 。 EOS
    我一直成為一個演員。
    

    7. Encoder-Decoder model (with attention)

    7.1 Encoder

    The encoder's job is to pass the input tokens through an embedding layer and a GRU, turning them into hidden states that serve as the context vectors later on.
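
    The GRU below is bidirectional, so it is worth recalling the shapes it returns; a quick check of my own (the torch.cat([hid[-2], hid[-1]], dim=1) in forward relies on this layout):

    rnn = nn.GRU(input_size=8, hidden_size=16, batch_first=True, bidirectional=True)
    x = torch.randn(4, 10, 8)       # [batch, seq_len, embed_size]
    out, hid = rnn(x)
    print(out.shape)                # torch.Size([4, 10, 32]): forward and backward outputs concatenated
    print(hid.shape)                # torch.Size([2, 4, 16]):  [num_layers*num_directions, batch, hidden]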

    class Encoder(nn.Module):
        def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
            super(Encoder, self).__init__()
            self.embed = nn.Embedding(vocab_size, embed_size)
            self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
            self.dropout = nn.Dropout(dropout)
            self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)
    
        def forward(self, x, lengths):
            sorted_len, sorted_idx = lengths.sort(0, descending=True)
            x_sorted = x[sorted_idx.long()]
            embedded = self.dropout(self.embed(x_sorted))
            
            packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
            packed_out, hid = self.rnn(packed_embedded)
            out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
            _, original_idx = sorted_idx.sort(0, descending=False)
            out = out[original_idx.long()].contiguous()
            hid = hid[:, original_idx.long()].contiguous()
            # hid: [2, batch_size, enc_hidden_size]
            
            hid = torch.cat([hid[-2], hid[-1]], dim=1) # concatenate the forward and backward hidden states of the last layer
            # hid: [batch_size, 2*enc_hidden_size]
            hid = torch.tanh(self.fc(hid)).unsqueeze(0)
            # hid: [1, batch_size, dec_hidden_size]
            # out: [batch_size, seq_len, 2*enc_hidden_size]
            return out, hid
    

    7.2 Luong Attention

    • In the figure (from Luong et al., not shown here), $h_t$ is the GRU output at step t, i.e. output

    • $\hat{h}_s$ is the encoder-side context; in query-key-value terms, the query is $h_t$ and the keys are the $\hat{h}_s$

    • The output is computed from the context vectors and the current decoder hidden states; the formulas are summarized below.
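
    The class below implements Luong's "general" attention; as a compact summary (my own rendering, writing $W_a$ for linear_in and $W_c$ for linear_out):

    $$
    \mathrm{score}(h_t, \hat{h}_s) = h_t^\top W_a \hat{h}_s, \qquad
    \alpha_{ts} = \frac{\exp\big(\mathrm{score}(h_t, \hat{h}_s)\big)}{\sum_{s'} \exp\big(\mathrm{score}(h_t, \hat{h}_{s'})\big)}, \qquad
    c_t = \sum_s \alpha_{ts} \, \hat{h}_s, \qquad
    \tilde{h}_t = \tanh\big(W_c \, [c_t; h_t]\big)
    $$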

    class Attention(nn.Module):
        def __init__(self, enc_hidden_size, dec_hidden_size):
            # enc_hidden_size is the same as in the Encoder
            super(Attention, self).__init__()
            self.enc_hidden_size = enc_hidden_size
            self.dec_hidden_size = dec_hidden_size
    
            self.linear_in = nn.Linear(enc_hidden_size*2, dec_hidden_size, bias=False)
            self.linear_out = nn.Linear(enc_hidden_size*2 + dec_hidden_size, dec_hidden_size)
            
        def forward(self, output, context, mask):
            # mask: batch_size, output_len, context_len     # the mask is built in the Decoder
            # output: batch_size, output_len, dec_hidden_size -- the Decoder's output
            # context: batch_size, context_len, 2*enc_hidden_size -- the Encoder's output
            # (the Encoder is bidirectional, the Decoder is unidirectional)
        
            batch_size = output.size(0)
            output_len = output.size(1)
            input_len = context.size(1) # input_len = context_len
            
            # Combine the decoder hidden states with the encoder hidden states to compute a score,
            # which is then turned into attention weights.
            # batch_size, context_len, dec_hidden_size
            # Step 1: multiply the encoder outputs hs by Wa, mapping 2*enc_hidden_size down to dec_hidden_size.
            # Q: W·context
            context_in = self.linear_in(context.view(batch_size*input_len, -1)).view(                
                                        batch_size, input_len, -1) 
            
            # Q·K
            # context_in.transpose(1,2): batch_size, dec_hidden_size, context_len 
            # output: batch_size, output_len, dec_hidden_size
            attn = torch.bmm(output, context_in.transpose(1,2)) 
            # batch_size, output_len, context_len
            # Step 2: dot ht with the result of step 1 to get the scores.
    
            attn.data.masked_fill_(mask, -1e6)
            # in-place fill (note the trailing underscore; the out-of-place masked_fill would return a new
            # tensor and leave attn unchanged). On masked_fill see https://blog.csdn.net/candy134834/article/details/84594754
            # mask must have the same shape as attn; wherever mask is 1, the score is set to -1e6 so that
            # softmax gives that position essentially zero weight (what "mask == 1" means is defined in the Decoder below).
    
            attn = F.softmax(attn, dim=2) 
            # batch_size, output_len, context_len
            # see the small example after this class for what softmax over dim=2 does
            # Step 3: compute the weight of every encoder hidden state.
            
            # context: batch_size, context_len, 2*enc_hidden_size
            context = torch.bmm(attn, context) 
            # batch_size, output_len, 2*enc_hidden_size
            # Step 4: the context vector is a weighted average of the encoder hidden states.
            
            # output: batch_size, output_len, dec_hidden_size
            output = torch.cat((context, output), dim=2) 
            # output: batch_size, output_len, 2*enc_hidden_size+dec_hidden_size
            # Step 5: concatenate the context vector with the decoder hidden states.
            
            output = output.view(batch_size*output_len, -1)
            # output.shape = (batch_size*output_len, 2*enc_hidden_size+dec_hidden_size)
            output = torch.tanh(self.linear_out(output)) 
            # output.shape = (batch_size*output_len, dec_hidden_size)
            output = output.view(batch_size, output_len, -1)
            # output.shape = (batch_size, output_len, dec_hidden_size)
            # attn.shape = (batch_size, output_len, context_len)
            return output, attn
    
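    The comment above refers to an example cell that is not included in this excerpt; a minimal illustration of my own of what softmax over dim=2 does for a [batch_size, output_len, context_len] tensor:

    scores = torch.tensor([[[1.0, 2.0, 3.0],
                            [1.0, 1.0, 1.0]]])     # shape [1, 2, 3]
    weights = F.softmax(scores, dim=2)             # normalize over the context_len dimension
    print(weights)
    print(weights.sum(dim=2))                      # each row of attention weights sums to 1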

    7.3 Decoder

    The decoder decides the next output word based on what has been translated so far and the context vectors.

    class Decoder(nn.Module):
        def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
            super(Decoder, self).__init__()
            self.embed = nn.Embedding(vocab_size, embed_size)
            self.attention = Attention(enc_hidden_size, dec_hidden_size)
            self.rnn = nn.GRU(embed_size, dec_hidden_size, batch_first=True)
            self.out = nn.Linear(dec_hidden_size, vocab_size)
            self.dropout = nn.Dropout(dropout)
    
        def create_mask(self, x_len, y_len):
            # x_len: lengths of the Chinese (output) sentences in the batch
            # y_len: lengths of the English (context) sentences in the batch
            # builds a mask of shape x_len * y_len
            device = x_len.device
            max_x_len = x_len.max()
            max_y_len = y_len.max()
            
            x_mask = torch.arange(max_x_len, device=device)[None, :] < x_len[:, None]
            # x_mask.shape = (batch_size, output_len)   # mask of the Chinese sentences
            y_mask = torch.arange(max_y_len, device=device)[None, :] < y_len[:, None]
            # y_mask.shape = (batch_size, context_len)  # mask of the English sentences
            
            mask = ( ~ x_mask[:, :, None] * y_mask[:, None, :]).byte()
            # mask = (1 - x_mask[:, :, None] * y_mask[:, None, :]).byte()
            # ~ (or 1-) negates the mask
            # x_mask[:, :, None] = (batch_size, output_len, 1)
            # y_mask[:, None, :] = (batch_size, 1, context_len)
            # mask.shape = (batch_size, output_len, context_len)
            # note that * here is an element-wise product with broadcasting, not a torch.bmm matrix product
            return mask
        
        def forward(self, encoder_out, x_lengths, y, y_lengths, hid):
            sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
            y_sorted = y[sorted_idx.long()]
            hid = hid[:, sorted_idx.long()]
            
            y_sorted = self.dropout(self.embed(y_sorted)) # batch_size, output_length, embed_size
    
            packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
            out, hid = self.rnn(packed_seq, hid)
            unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
            _, original_idx = sorted_idx.sort(0, descending=False)
            output_seq = unpacked[original_idx.long()].contiguous()
            hid = hid[:, original_idx.long()].contiguous()
    
            mask = self.create_mask(y_lengths, x_lengths) # careful: the first argument is the list of Chinese (output) sentence lengths
    
            output, attn = self.attention(output_seq, encoder_out, mask) 
            # output.shape = (batch_size, output_len, dec_hidden_size)
            # attn.shape = (batch_size, output_len, context_len)
            
            # self.out = nn.Linear(dec_hidden_size, vocab_size)
            output = F.log_softmax(self.out(output), -1) # final output probabilities
            # output.shape = (batch_size, output_len, vocab_size)
            # log_softmax over the last (vocab_size) dimension
            # hid.shape = (1, batch_size, dec_hidden_size)
            return output, hid, attn
    

    7.4 Seq2Seq

    Finally, build the Seq2Seq model that chains the encoder, attention, and decoder together.

    class Seq2Seq(nn.Module):
        def __init__(self, encoder, decoder):
            super(Seq2Seq, self).__init__()
            self.encoder = encoder
            self.decoder = decoder
            
        def forward(self, x, x_lengths, y, y_lengths):
            encoder_out, hid = self.encoder(x, x_lengths)
            # print(hid.shape)=torch.Size([1, batch_size, dec_hidden_size])
            # print(out.shape)=torch.Size([batch_size, seq_len, 2*enc_hidden_size])
            output, hid, attn = self.decoder(encoder_out=encoder_out, 
                        x_lengths=x_lengths,
                        y=y,
                        y_lengths=y_lengths,
                        hid=hid)
            # output =(batch_size, output_len, vocab_size)
            # hid.shape = (1, batch_size, dec_hidden_size)
            # attn.shape = (batch_size, output_len, context_len)
            return output, attn
        
    
        def translate(self, x, x_lengths, y, max_length=100):
            encoder_out, hid = self.encoder(x, x_lengths)
            preds = []
            batch_size = x.shape[0]
            attns = []
            for i in range(max_length):
                output, hid, attn = self.decoder(encoder_out, 
                        x_lengths,
                        y,
                        torch.ones(batch_size).long().to(y.device),
                        hid)
                y = output.max(2)[1].view(batch_size, 1)
                preds.append(y)
                attns.append(attn)
                
            return torch.cat(preds, 1), torch.cat(attns, 1)
    

    8. Create the attention model and train it with the train function defined above

    dropout = 0.2
    embed_size = hidden_size = 100
    encoder = Encoder(vocab_size=en_total_words,
                        embed_size=embed_size,
                        enc_hidden_size=hidden_size,
                        dec_hidden_size=hidden_size,
                        dropout=dropout)
    decoder = Decoder(vocab_size=cn_total_words,
                        embed_size=embed_size,
                        enc_hidden_size=hidden_size,
                        dec_hidden_size=hidden_size,
                        dropout=dropout)
    model = Seq2Seq(encoder, decoder)
    model = model.to(device)
    loss_fn = LanguageModelCriterion().to(device)
    optimizer = torch.optim.Adam(model.parameters())
    
    train(model, train_data, num_epochs=100)
    
    /usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:25: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. (Triggered internally at  /pytorch/aten/src/ATen/native/cuda/LegacyDefinitions.cpp:19.)
    Epoch 0 iteration 0 loss 8.077441215515137
    Epoch 0 iteration 100 loss 5.345982551574707
    Epoch 0 iteration 200 loss 4.56335973739624
    Epoch 0 Training loss 5.50921318013691
    Evaluation loss 5.080491080824646
    Epoch 1 iteration 0 loss 4.47300386428833
    Epoch 1 iteration 100 loss 4.909076690673828
    Epoch 1 iteration 200 loss 4.016790390014648
    Epoch 1 Training loss 4.876065829219002
    Epoch 2 iteration 0 loss 3.9774909019470215
    Epoch 2 iteration 100 loss 4.472506046295166
    Epoch 2 iteration 200 loss 3.612961530685425
    Epoch 2 Training loss 4.438564572733501
    Epoch 3 iteration 0 loss 3.582581043243408
    Epoch 3 iteration 100 loss 4.136115074157715
    Epoch 3 iteration 200 loss 3.3212907314300537
    Epoch 3 Training loss 4.112743628998822
    Epoch 4 iteration 0 loss 3.2368381023406982
    Epoch 4 iteration 100 loss 3.8409037590026855
    Epoch 4 iteration 200 loss 3.097996711730957
    Epoch 4 Training loss 3.8477170270406864
    Epoch 5 iteration 0 loss 3.0059776306152344
    Epoch 5 iteration 100 loss 3.6137866973876953
    Epoch 5 iteration 200 loss 2.8685319423675537
    Epoch 5 Training loss 3.62822900357212
    Evaluation loss 3.637867412334476
    Epoch 6 iteration 0 loss 2.742856979370117
    Epoch 6 iteration 100 loss 3.390110492706299
    Epoch 6 iteration 200 loss 2.6777687072753906
    Epoch 6 Training loss 3.438536527389024
    Epoch 7 iteration 0 loss 2.585566759109497
    Epoch 7 iteration 100 loss 3.237795352935791
    Epoch 7 iteration 200 loss 2.5204241275787354
    Epoch 7 Training loss 3.27654657979662
    Epoch 8 iteration 0 loss 2.4295897483825684
    Epoch 8 iteration 100 loss 3.1119232177734375
    Epoch 8 iteration 200 loss 2.3597609996795654
    Epoch 8 Training loss 3.134849904339776
    Epoch 9 iteration 0 loss 2.2652432918548584
    Epoch 9 iteration 100 loss 2.9519033432006836
    Epoch 9 iteration 200 loss 2.217094898223877
    Epoch 9 Training loss 3.0061632458155874
    Epoch 10 iteration 0 loss 2.1327497959136963
    Epoch 10 iteration 100 loss 2.851846694946289
    Epoch 10 iteration 200 loss 2.1458141803741455
    Epoch 10 Training loss 2.894426641793655
    Evaluation loss 3.166497583308483
    Epoch 11 iteration 0 loss 2.013716697692871
    Epoch 11 iteration 100 loss 2.7616653442382812
    Epoch 11 iteration 200 loss 2.0029869079589844
    Epoch 11 Training loss 2.791488365026667
    Epoch 12 iteration 0 loss 1.9475183486938477
    Epoch 12 iteration 100 loss 2.647017240524292
    Epoch 12 iteration 200 loss 1.909979224205017
    Epoch 12 Training loss 2.698569336892456
    Epoch 13 iteration 0 loss 1.823117733001709
    Epoch 13 iteration 100 loss 2.6043999195098877
    Epoch 13 iteration 200 loss 1.8382450342178345
    Epoch 13 Training loss 2.616960156850951
    Epoch 14 iteration 0 loss 1.7701350450515747
    Epoch 14 iteration 100 loss 2.528083086013794
    Epoch 14 iteration 200 loss 1.7523369789123535
    Epoch 14 Training loss 2.5364692366823496
    Epoch 15 iteration 0 loss 1.6475502252578735
    Epoch 15 iteration 100 loss 2.4581422805786133
    Epoch 15 iteration 200 loss 1.7099241018295288
    Epoch 15 Training loss 2.4666260303200516
    Evaluation loss 2.96595491125677
    Epoch 16 iteration 0 loss 1.5571707487106323
    Epoch 16 iteration 100 loss 2.3642022609710693
    Epoch 16 iteration 200 loss 1.6701610088348389
    Epoch 16 Training loss 2.3992404009048993
    Epoch 17 iteration 0 loss 1.5091164112091064
    Epoch 17 iteration 100 loss 2.3246700763702393
    Epoch 17 iteration 200 loss 1.5856270790100098
    Epoch 17 Training loss 2.3398954671301877
    Epoch 18 iteration 0 loss 1.4500510692596436
    Epoch 18 iteration 100 loss 2.3111109733581543
    Epoch 18 iteration 200 loss 1.5008033514022827
    Epoch 18 Training loss 2.2817300454663068
    Epoch 19 iteration 0 loss 1.3648465871810913
    Epoch 19 iteration 100 loss 2.2263357639312744
    Epoch 19 iteration 200 loss 1.434478521347046
    Epoch 19 Training loss 2.2250880660919448
    Epoch 20 iteration 0 loss 1.29836106300354
    Epoch 20 iteration 100 loss 2.170522928237915
    Epoch 20 iteration 200 loss 1.413167119026184
    Epoch 20 Training loss 2.174868286439991
    Evaluation loss 2.862008639379293
    Epoch 21 iteration 0 loss 1.2679147720336914
    Epoch 21 iteration 100 loss 2.1024975776672363
    Epoch 21 iteration 200 loss 1.3479344844818115
    Epoch 21 Training loss 2.124773566655596
    Epoch 22 iteration 0 loss 1.2715562582015991
    Epoch 22 iteration 100 loss 2.0454132556915283
    Epoch 22 iteration 200 loss 1.2550404071807861
    Epoch 22 Training loss 2.0813773049198834
    Epoch 23 iteration 0 loss 1.204933762550354
    Epoch 23 iteration 100 loss 1.986390471458435
    Epoch 23 iteration 200 loss 1.3080803155899048
    Epoch 23 Training loss 2.035502688247159
    Epoch 24 iteration 0 loss 1.1525975465774536
    Epoch 24 iteration 100 loss 2.010538101196289
    Epoch 24 iteration 200 loss 1.2282871007919312
    Epoch 24 Training loss 1.9932144449453215
    Epoch 25 iteration 0 loss 1.1036208868026733
    Epoch 25 iteration 100 loss 1.9166961908340454
    Epoch 25 iteration 200 loss 1.1343692541122437
    Epoch 25 Training loss 1.9600739742604965
    Evaluation loss 2.8176820923223045
    Epoch 26 iteration 0 loss 1.126081109046936
    Epoch 26 iteration 100 loss 1.8861745595932007
    Epoch 26 iteration 200 loss 1.1452618837356567
    Epoch 26 Training loss 1.9179931864284319
    Epoch 27 iteration 0 loss 1.0936931371688843
    Epoch 27 iteration 100 loss 1.8307372331619263
    Epoch 27 iteration 200 loss 1.1571146249771118
    Epoch 27 Training loss 1.8831396913691085
    Epoch 28 iteration 0 loss 1.0479011535644531
    Epoch 28 iteration 100 loss 1.8134833574295044
    Epoch 28 iteration 200 loss 1.1056196689605713
    Epoch 28 Training loss 1.8484488868290145
    Epoch 29 iteration 0 loss 1.0205118656158447
    Epoch 29 iteration 100 loss 1.821661353111267
    Epoch 29 iteration 200 loss 1.0737680196762085
    Epoch 29 Training loss 1.8186136229030332
    Epoch 30 iteration 0 loss 0.9615429043769836
    Epoch 30 iteration 100 loss 1.7652055025100708
    Epoch 30 iteration 200 loss 0.9891017079353333
    Epoch 30 Training loss 1.7838154237577641
    Evaluation loss 2.791978492601989
    Epoch 31 iteration 0 loss 0.9656916856765747
    Epoch 31 iteration 100 loss 1.7245019674301147
    Epoch 31 iteration 200 loss 1.0227261781692505
    Epoch 31 Training loss 1.7579890261914233
    Epoch 32 iteration 0 loss 0.950885534286499
    Epoch 32 iteration 100 loss 1.7047593593597412
    Epoch 32 iteration 200 loss 1.0126252174377441
    Epoch 32 Training loss 1.7265817618896626
    Epoch 33 iteration 0 loss 0.9383729696273804
    Epoch 33 iteration 100 loss 1.7073816061019897
    Epoch 33 iteration 200 loss 0.9319257736206055
    Epoch 33 Training loss 1.701657226905382
    Epoch 34 iteration 0 loss 0.8925782442092896
    Epoch 34 iteration 100 loss 1.6764633655548096
    Epoch 34 iteration 200 loss 0.9110333323478699
    Epoch 34 Training loss 1.6714374329267176
    Epoch 35 iteration 0 loss 0.9124199748039246
    Epoch 35 iteration 100 loss 1.5932414531707764
    Epoch 35 iteration 200 loss 0.9045222997665405
    Epoch 35 Training loss 1.6459569074645013
    Evaluation loss 2.7976669954047697
    Epoch 36 iteration 0 loss 0.8820086121559143
    Epoch 36 iteration 100 loss 1.5867435932159424
    Epoch 36 iteration 200 loss 0.88615483045578
    Epoch 36 Training loss 1.6248752288905044
    Epoch 37 iteration 0 loss 0.8861231803894043
    Epoch 37 iteration 100 loss 1.540147304534912
    Epoch 37 iteration 200 loss 0.8625170588493347
    Epoch 37 Training loss 1.6025891727084938
    Epoch 38 iteration 0 loss 0.8272038698196411
    Epoch 38 iteration 100 loss 1.5469865798950195
    Epoch 38 iteration 200 loss 0.8701044321060181
    Epoch 38 Training loss 1.5775597927062583
    Epoch 39 iteration 0 loss 0.7841694951057434
    Epoch 39 iteration 100 loss 1.587996244430542
    Epoch 39 iteration 200 loss 0.8621845245361328
    Epoch 39 Training loss 1.5550835649611023
    Epoch 40 iteration 0 loss 0.7730535268783569
    Epoch 40 iteration 100 loss 1.510125756263733
    Epoch 40 iteration 200 loss 0.8023701906204224
    Epoch 40 Training loss 1.536449474043806
    Evaluation loss 2.794806465695927
    Epoch 41 iteration 0 loss 0.8037686347961426
    Epoch 41 iteration 100 loss 1.4897831678390503
    Epoch 41 iteration 200 loss 0.791727602481842
    Epoch 41 Training loss 1.5090646408452422
    Epoch 42 iteration 0 loss 0.7824649214744568
    Epoch 42 iteration 100 loss 1.4806140661239624
    Epoch 42 iteration 200 loss 0.7969489693641663
    Epoch 42 Training loss 1.4928973876534222
    Epoch 43 iteration 0 loss 0.7667363286018372
    Epoch 43 iteration 100 loss 1.4101524353027344
    Epoch 43 iteration 200 loss 0.7620548009872437
    Epoch 43 Training loss 1.4743025649328945
    Epoch 44 iteration 0 loss 0.7359268069267273
    Epoch 44 iteration 100 loss 1.3919748067855835
    Epoch 44 iteration 200 loss 0.8053562045097351
    Epoch 44 Training loss 1.4554574874191657
    Epoch 45 iteration 0 loss 0.7237775921821594
    Epoch 45 iteration 100 loss 1.3988888263702393
    Epoch 45 iteration 200 loss 0.7393531203269958
    Epoch 45 Training loss 1.4322836776244472
    Evaluation loss 2.812571211478882
    Epoch 46 iteration 0 loss 0.6948044300079346
    Epoch 46 iteration 100 loss 1.304335594177246
    Epoch 46 iteration 200 loss 0.689096987247467
    Epoch 46 Training loss 1.4196053662905366
    Epoch 47 iteration 0 loss 0.6662931442260742
    Epoch 47 iteration 100 loss 1.3609318733215332
    Epoch 47 iteration 200 loss 0.7002820372581482
    Epoch 47 Training loss 1.4011935120614474
    Epoch 48 iteration 0 loss 0.753171443939209
    Epoch 48 iteration 100 loss 1.290736436843872
    Epoch 48 iteration 200 loss 0.6648774147033691
    Epoch 48 Training loss 1.3849073988196539
    Epoch 49 iteration 0 loss 0.7202473878860474
    Epoch 49 iteration 100 loss 1.3155896663665771
    Epoch 49 iteration 200 loss 0.7304859757423401
    Epoch 49 Training loss 1.3667800886861978
    Epoch 50 iteration 0 loss 0.6739968061447144
    Epoch 50 iteration 100 loss 1.3187365531921387
    Epoch 50 iteration 200 loss 0.6818186044692993
    Epoch 50 Training loss 1.3522975228605367
    Evaluation loss 2.8305587463367226
    Epoch 51 iteration 0 loss 0.7073860168457031
    Epoch 51 iteration 100 loss 1.3020031452178955
    Epoch 51 iteration 200 loss 0.6439692974090576
    Epoch 51 Training loss 1.3355847990987002
    Epoch 52 iteration 0 loss 0.7059903144836426
    Epoch 52 iteration 100 loss 1.3240293264389038
    Epoch 52 iteration 200 loss 0.6690763831138611
    Epoch 52 Training loss 1.3210225660783441
    Epoch 53 iteration 0 loss 0.6332668662071228
    Epoch 53 iteration 100 loss 1.2513703107833862
    Epoch 53 iteration 200 loss 0.6558292508125305
    Epoch 53 Training loss 1.3107876620531327
    Epoch 54 iteration 0 loss 0.6457605957984924
    Epoch 54 iteration 100 loss 1.246716856956482
    Epoch 54 iteration 200 loss 0.6521980166435242
    Epoch 54 Training loss 1.2941160204924305
    Epoch 55 iteration 0 loss 0.6227668523788452
    Epoch 55 iteration 100 loss 1.2278225421905518
    Epoch 55 iteration 200 loss 0.6727674007415771
    Epoch 55 Training loss 1.2778384867442392
    Evaluation loss 2.853066331010339
    Epoch 56 iteration 0 loss 0.5656446814537048
    Epoch 56 iteration 100 loss 1.2470365762710571
    Epoch 56 iteration 200 loss 0.6154574751853943
    Epoch 56 Training loss 1.2628238236702862
    Epoch 57 iteration 0 loss 0.5883901119232178
    Epoch 57 iteration 100 loss 1.220670461654663
    Epoch 57 iteration 200 loss 0.5693823099136353
    Epoch 57 Training loss 1.2493639340990528
    Epoch 58 iteration 0 loss 0.5862078666687012
    Epoch 58 iteration 100 loss 1.1798666715621948
    Epoch 58 iteration 200 loss 0.6039236187934875
    Epoch 58 Training loss 1.233422517480705
    Epoch 59 iteration 0 loss 0.5904982686042786
    Epoch 59 iteration 100 loss 1.1922262907028198
    Epoch 59 iteration 200 loss 0.5879594087600708
    Epoch 59 Training loss 1.2254928604160356
    Epoch 60 iteration 0 loss 0.5759232640266418
    Epoch 60 iteration 100 loss 1.153181791305542
    Epoch 60 iteration 200 loss 0.5618763566017151
    Epoch 60 Training loss 1.208009701754125
    Evaluation loss 2.871801325149645
    Epoch 61 iteration 0 loss 0.5813993215560913
    Epoch 61 iteration 100 loss 1.1644539833068848
    Epoch 61 iteration 200 loss 0.574725329875946
    Epoch 61 Training loss 1.1981734446603696
    Epoch 62 iteration 0 loss 0.54474276304245
    Epoch 62 iteration 100 loss 1.172760248184204
    Epoch 62 iteration 200 loss 0.5736648440361023
    Epoch 62 Training loss 1.1898703442169898
    Epoch 63 iteration 0 loss 0.5367869138717651
    Epoch 63 iteration 100 loss 1.1455975770950317
    Epoch 63 iteration 200 loss 0.5316013097763062
    Epoch 63 Training loss 1.17624104425602
    Epoch 64 iteration 0 loss 0.5965208411216736
    Epoch 64 iteration 100 loss 1.0865147113800049
    Epoch 64 iteration 200 loss 0.5165320634841919
    Epoch 64 Training loss 1.1626691673104586
    Epoch 65 iteration 0 loss 0.5757507085800171
    Epoch 65 iteration 100 loss 1.0935884714126587
    Epoch 65 iteration 200 loss 0.5055180191993713
    Epoch 65 Training loss 1.1486647791128823
    Evaluation loss 2.888705662914898
    Epoch 66 iteration 0 loss 0.554165244102478
    Epoch 66 iteration 100 loss 1.0687988996505737
    Epoch 66 iteration 200 loss 0.5742641687393188
    Epoch 66 Training loss 1.137105361580985
    Epoch 67 iteration 0 loss 0.5457087755203247
    Epoch 67 iteration 100 loss 1.0431346893310547
    Epoch 67 iteration 200 loss 0.5005226731300354
    Epoch 67 Training loss 1.1251085623172112
    Epoch 68 iteration 0 loss 0.5115629434585571
    Epoch 68 iteration 100 loss 1.0742378234863281
    Epoch 68 iteration 200 loss 0.4768718481063843
    Epoch 68 Training loss 1.1169700110112382
    Epoch 69 iteration 0 loss 0.5225317478179932
    Epoch 69 iteration 100 loss 1.041317343711853
    Epoch 69 iteration 200 loss 0.534132719039917
    Epoch 69 Training loss 1.1102069269037087
    Epoch 70 iteration 0 loss 0.48191702365875244
    Epoch 70 iteration 100 loss 1.0193127393722534
    Epoch 70 iteration 200 loss 0.4716692566871643
    Epoch 70 Training loss 1.0953487060532974
    Evaluation loss 2.9113613200675643
    Epoch 71 iteration 0 loss 0.59366375207901
    Epoch 71 iteration 100 loss 1.042155146598816
    Epoch 71 iteration 200 loss 0.45154234766960144
    Epoch 71 Training loss 1.091857606453407
    Epoch 72 iteration 0 loss 0.5238001346588135
    Epoch 72 iteration 100 loss 1.027955174446106
    Epoch 72 iteration 200 loss 0.5312687754631042
    Epoch 72 Training loss 1.0819147441571477
    Epoch 73 iteration 0 loss 0.5490065217018127
    Epoch 73 iteration 100 loss 1.0117655992507935
    Epoch 73 iteration 200 loss 0.5065831542015076
    Epoch 73 Training loss 1.0687738825424347
    Epoch 74 iteration 0 loss 0.5063045024871826
    Epoch 74 iteration 100 loss 1.0293574333190918
    Epoch 74 iteration 200 loss 0.5003397464752197
    Epoch 74 Training loss 1.0547682162543772
    Epoch 75 iteration 0 loss 0.45235222578048706
    Epoch 75 iteration 100 loss 1.0297720432281494
    Epoch 75 iteration 200 loss 0.4086465835571289
    Epoch 75 Training loss 1.0492441391159522
    Evaluation loss 2.945518095083358
    Epoch 76 iteration 0 loss 0.46895310282707214
    Epoch 76 iteration 100 loss 0.9821916818618774
    Epoch 76 iteration 200 loss 0.48269033432006836
    Epoch 76 Training loss 1.0391477853463758
    Epoch 77 iteration 0 loss 0.4749329388141632
    Epoch 77 iteration 100 loss 0.9370260238647461
    Epoch 77 iteration 200 loss 0.5174757242202759
    Epoch 77 Training loss 1.0302731247109642
    Epoch 78 iteration 0 loss 0.4239536225795746
    Epoch 78 iteration 100 loss 0.982223391532898
    Epoch 78 iteration 200 loss 0.46800896525382996
    Epoch 78 Training loss 1.02385489594265
    Epoch 79 iteration 0 loss 0.5065938830375671
    Epoch 79 iteration 100 loss 0.9628017544746399
    Epoch 79 iteration 200 loss 0.4790896773338318
    Epoch 79 Training loss 1.014064338724403
    Epoch 80 iteration 0 loss 0.43752557039260864
    Epoch 80 iteration 100 loss 0.8520130515098572
    Epoch 80 iteration 200 loss 0.40985599160194397
    Epoch 80 Training loss 1.002772340443797
    Evaluation loss 2.9621174652470703
    Epoch 81 iteration 0 loss 0.44454529881477356
    Epoch 81 iteration 100 loss 0.9402937293052673
    Epoch 81 iteration 200 loss 0.41907238960266113
    Epoch 81 Training loss 0.9969750344440632
    Epoch 82 iteration 0 loss 0.4125458896160126
    Epoch 82 iteration 100 loss 0.9050692915916443
    Epoch 82 iteration 200 loss 0.5123288035392761
    Epoch 82 Training loss 0.989270733289982
    Epoch 83 iteration 0 loss 0.4764525592327118
    Epoch 83 iteration 100 loss 0.9303292632102966
    Epoch 83 iteration 200 loss 0.44956347346305847
    Epoch 83 Training loss 0.9836232322264327
    Epoch 84 iteration 0 loss 0.48803961277008057
    Epoch 84 iteration 100 loss 0.9711679816246033
    Epoch 84 iteration 200 loss 0.44382917881011963
    Epoch 84 Training loss 0.9754019522005947
    Epoch 85 iteration 0 loss 0.46858376264572144
    Epoch 85 iteration 100 loss 0.9077855944633484
    Epoch 85 iteration 200 loss 0.4368401765823364
    Epoch 85 Training loss 0.9719701637084417
    Evaluation loss 2.990323471814928
    Epoch 86 iteration 0 loss 0.4658893346786499
    Epoch 86 iteration 100 loss 0.8741357326507568
    Epoch 86 iteration 200 loss 0.423090398311615
    Epoch 86 Training loss 0.9583479015194021
    Epoch 87 iteration 0 loss 0.4344865381717682
    Epoch 87 iteration 100 loss 0.8711681365966797
    Epoch 87 iteration 200 loss 0.41789063811302185
    Epoch 87 Training loss 0.9474942575734959
    Epoch 88 iteration 0 loss 0.42888087034225464
    Epoch 88 iteration 100 loss 0.8649926781654358
    Epoch 88 iteration 200 loss 0.4007169306278229
    Epoch 88 Training loss 0.9426996659812006
    Epoch 89 iteration 0 loss 0.4257383942604065
    Epoch 89 iteration 100 loss 0.8543802499771118
    Epoch 89 iteration 200 loss 0.41755053400993347
    Epoch 89 Training loss 0.9360784180891997
    Epoch 90 iteration 0 loss 0.44567570090293884
    Epoch 90 iteration 100 loss 0.8825702667236328
    Epoch 90 iteration 200 loss 0.41934728622436523
    Epoch 90 Training loss 0.9298315100552865
    Evaluation loss 3.0115221658685347
    Epoch 91 iteration 0 loss 0.4208157956600189
    Epoch 91 iteration 100 loss 0.813216507434845
    Epoch 91 iteration 200 loss 0.4040917158126831
    Epoch 91 Training loss 0.9193997417003693
    Epoch 92 iteration 0 loss 0.41099944710731506
    Epoch 92 iteration 100 loss 0.8445271253585815
    Epoch 92 iteration 200 loss 0.3656329810619354
    Epoch 92 Training loss 0.9176739377176427
    Epoch 93 iteration 0 loss 0.3757087290287018
    Epoch 93 iteration 100 loss 0.8153252601623535
    Epoch 93 iteration 200 loss 0.3429928421974182
    Epoch 93 Training loss 0.908510602970967
    Epoch 94 iteration 0 loss 0.42818954586982727
    Epoch 94 iteration 100 loss 0.8111163377761841
    Epoch 94 iteration 200 loss 0.4069685935974121
    Epoch 94 Training loss 0.902406791391548
    Epoch 95 iteration 0 loss 0.37496259808540344
    Epoch 95 iteration 100 loss 0.7711942195892334
    Epoch 95 iteration 200 loss 0.4711993336677551
    Epoch 95 Training loss 0.8950450409158558
    Evaluation loss 3.034074325896889
    Epoch 96 iteration 0 loss 0.3465866148471832
    Epoch 96 iteration 100 loss 0.7963153123855591
    Epoch 96 iteration 200 loss 0.34403669834136963
    Epoch 96 Training loss 0.8901747859619997
    Epoch 97 iteration 0 loss 0.40915727615356445
    Epoch 97 iteration 100 loss 0.8184841275215149
    Epoch 97 iteration 200 loss 0.39140430092811584
    Epoch 97 Training loss 0.883020128311112
    Epoch 98 iteration 0 loss 0.35649484395980835
    Epoch 98 iteration 100 loss 0.858453094959259
    Epoch 98 iteration 200 loss 0.3666226267814636
    Epoch 98 Training loss 0.8780363934074935
    Epoch 99 iteration 0 loss 0.41814950108528137
    Epoch 99 iteration 100 loss 0.8482405543327332
    Epoch 99 iteration 200 loss 0.3461854159832001
    Epoch 99 Training loss 0.8755297044370204
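
    The training loss keeps dropping all the way to epoch 99, but the evaluation loss bottoms out around 2.79 near epoch 30 and then creeps back above 3.0, which indicates the model is starting to overfit the training set. Below is a minimal sketch (not part of the original code) for keeping the checkpoint with the lowest dev loss; model and the periodic evaluation calls refer to the objects defined earlier in the post, and the file name is only an example.

    import torch

    best_eval_loss = float('inf')   # lowest evaluation loss seen so far

    def maybe_save_best(model, eval_loss, path='best_seq2seq_attn.pt'):
        """Save the model weights whenever the evaluation loss improves."""
        global best_eval_loss
        if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss
            torch.save(model.state_dict(), path)

    Calling maybe_save_best(model, evaluation_loss) right after each "Evaluation loss" print would keep roughly the epoch-30 weights instead of the overfit final ones.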
    

    9. Calling the translate_dev function defined above

    # Translate dev sentences 100-119, printing a blank line between examples
    for i in range(100, 120):
        translate_dev(i)
        print()
    
    BOS you have nice skin . EOS
    BOS 你 的 皮 膚 真 好 。 EOS
    /usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:33: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. (Triggered internally at  /pytorch/aten/src/ATen/native/cuda/LegacyDefinitions.cpp:19.)
    你最好有很多新鲜事。
    
    BOS you 're UNK correct . EOS
    BOS 你 部 分 正 确 。 EOS
    你的生身。
    
    BOS everyone admired his courage . EOS
    BOS 每 個 人 都 佩 服 他 的 勇 氣 。 EOS
    每個人都認釋了他的意見。
    
    BOS what time is it ? EOS
    BOS 几 点 了 ? EOS
    多少钱?
    
    BOS i 'm free tonight . EOS
    BOS 我 今 晚 有 空 。 EOS
    我今晚有空。
    
    BOS here is your book . EOS
    BOS 這 是 你 的 書 。 EOS
    你的書在這裡。
    
    BOS they are at lunch . EOS
    BOS 他 们 在 吃 午 饭 。 EOS
    他们午吃午饭。
    
    BOS this chair is UNK . EOS
    BOS 這 把 椅 子 很 UNK 。 EOS
    这里的发生是门。
    
    BOS it 's pretty heavy . EOS
    BOS 它 真 重 。 EOS
    它是居机场的。
    
    BOS many attended his funeral . EOS
    BOS 很 多 人 都 参 加 了 他 的 葬 礼 。 EOS
    每个人都知道他的音樂。
    
    BOS training will be provided . EOS
    BOS 会 有 训 练 。 EOS
    即待有空光。
    
    BOS someone is watching you . EOS
    BOS 有 人 在 看 著 你 。 EOS
    有人在看你。
    
    BOS i slapped his face . EOS
    BOS 我 摑 了 他 的 臉 。 EOS
    我愛他打斷了。
    
    BOS i like UNK music . EOS
    BOS 我 喜 歡 流 行 音 樂 。 EOS
    我喜欢阅读。
    
    BOS tom had no children . EOS
    BOS T o m 沒 有 孩 子 。 EOS
    汤姆没有孩子。
    
    BOS please lock the door . EOS
    BOS 請 把 門 鎖 上 。 EOS
    請關門門。
    
    BOS tom has calmed down . EOS
    BOS 汤 姆 冷 静 下 来 了 。 EOS
    Tom有三個走。
    
    BOS please speak more loudly . EOS
    BOS 請 說 大 聲 一 點 兒 。 EOS
    請講更多的聲外。
    
    BOS keep next sunday free . EOS
    BOS 把 下 周 日 空 出 来 。 EOS
    下個星期一下吧。
    
    BOS i made a mistake . EOS
    BOS 我 犯 了 一 個 錯 。 EOS
    我错了錯誤。
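
    Each group above shows the English source sentence, the Chinese reference, and the model's translation. The UserWarning in the first group appears because masked_fill_ receives a torch.uint8 mask, which recent PyTorch versions deprecate in favour of torch.bool. A standalone illustration of the fix (the tensors below are placeholders, not the exact variables of the attention module defined earlier) is simply to cast the mask before calling masked_fill_:

    import torch

    # Fake attention scores and a uint8 padding mask, for illustration only.
    scores = torch.randn(2, 5)
    mask = torch.tensor([[0, 0, 1, 1, 1],
                         [0, 0, 0, 1, 1]], dtype=torch.uint8)

    # Casting the mask to bool silences the deprecation warning.
    scores.masked_fill_(mask.bool(), -1e6)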
    
  • Original article: https://www.cnblogs.com/douzujun/p/13624567.html