• 2.使用RNN做诗歌生成


    诗歌生成比分类问题要稍微麻烦一些,而且这是第一次使用RNN处理文本方面的问题,其中还是有很多概念性的东西需要理解。

    数据下载:

    链接:https://pan.baidu.com/s/1uCDup7U5rGuIlIb-lnZgjQ
    提取码:f436

    data.py——数据处理

     1 import numpy as np
     2 import os
     3 
     def get_data(conf):
         """Load the pre-processed poetry dataset from ``conf.pickle_path``.

         :param conf: configuration object; only ``pickle_path`` is read here
         :return: tuple ``(data, word2ix, ix2word)`` in that order, where
             ``data`` is the array stored under the ``data`` key (per the
             original notes, each row holds the character ids of one poem),
             ``word2ix`` maps a character to its index id (e.g. u'月' -> 100)
             and ``ix2word`` maps an index id back to its character
         :raises FileNotFoundError: if ``conf.pickle_path`` does not exist
         """
         if not os.path.exists(conf.pickle_path):
             # Falling through used to return None, which only surfaced later
             # as an unpacking error in the caller; fail here with a clear cause.
             raise FileNotFoundError(
                 "dataset file not found: %s" % conf.pickle_path)

         # The two vocabulary dicts are stored as 0-d object arrays, which NumPy
         # refuses to load unless allow_pickle=True is passed explicitly.
         # NOTE: unpickling can execute arbitrary code; only load trusted files.
         with np.load(conf.pickle_path, allow_pickle=True) as datas:
             data = datas['data']
             ix2word = datas['ix2word'].item()
             word2ix = datas['word2ix'].item()
         return data, word2ix, ix2word
    View Code

    config.py——配置文件

     class Config:
         """Default settings for the poetry model.

         Create a sub-class and override the attributes that need to change
         in order to build a custom configuration.
         """

         # ---- data and checkpoint locations ----
         pickle_path = 'data/tang.npz'         # where the dataset is stored
         model_path = 'checkpoints/tang.pth'   # path of the saved model
         model_prefix = 'checkpoints/tang'     # prefix for periodic checkpoints

         # ---- network shape ----
         embedding_dim = 128   # word-vector dimension
         hidden_dim = 256      # LSTM hidden-state dimension
         num_layers = 2        # number of LSTM layers

         # ---- training loop ----
         # Number of training epochs. The attribute name (misspelling included)
         # is what the training code reads, so it is kept as-is.
         max_epech = 200
         batch_size = 128      # mini-batch size
         num_workers = 4       # worker processes used by the DataLoader
         save_every = 10       # save weights and sample poems every N epochs
         use_gpu = True

         # ---- generation ----
         start_words = '春江花月夜'   # start words
         p_type = 'acrostic'          # kind of poem to generate (acrostic by default)
         max_gen_len = 200            # maximum length of a generated poem
         out_path = 'out'                     # poems generated during training
         out_poetry_path = 'out/poetry.txt'   # poems generated at test time
    View Code

    model.py——模型

     1 import torch.nn as nn
     2 import torch
     class PoetryModel(nn.Module):
         """Character-level language model: embedding -> LSTM -> linear layer."""

         def __init__(self, vocab_size, conf, device):
             """
             :param vocab_size: number of entries in the vocabulary
             :param conf: configuration providing ``num_layers``, ``hidden_dim``
                 and ``embedding_dim``
             :param device: kept on ``self.device`` for callers that read it;
                 the forward pass no longer depends on it
             """
             super().__init__()
             self.num_layers = conf.num_layers
             self.hidden_dim = conf.hidden_dim
             self.device = device
             # Embedding table: (vocabulary size, embedding dimension).
             self.embeddings = nn.Embedding(vocab_size, conf.embedding_dim)
             # Multi-layer LSTM. batch_first is left at its default, so tensors
             # are laid out sequence-first: (seq_len, batch, feature).
             self.lstm = nn.LSTM(conf.embedding_dim, conf.hidden_dim,
                                 num_layers=self.num_layers)
             # Projects hidden states onto vocabulary logits; no softmax is
             # applied inside the model.
             self.linear_out = nn.Linear(conf.hidden_dim, vocab_size)

         def forward(self, input, hidden=None):
             """Run the model over a batch of index sequences.

             :param input: index tensor of shape (seq_len, batch_size)
             :param hidden: optional ``(h0, c0)`` tuple; zeros are used when omitted
             :return: ``(output, hidden)`` where ``output`` has shape
                 (seq_len * batch_size, vocab_size) and ``hidden`` is the
                 ``(h_n, c_n)`` tuple returned by the LSTM
             """
             seq_len, batch_size = input.size()
             embeds = self.embeddings(input)  # (seq_len, batch_size, embedding_dim)
             if hidden is None:
                 # Allocate the initial state next to the embeddings (same device
                 # and dtype) instead of on the device passed to __init__, so the
                 # model keeps working after being moved with .to()/.cpu().
                 h0 = embeds.new_zeros(self.num_layers, batch_size, self.hidden_dim)
                 c0 = embeds.new_zeros(self.num_layers, batch_size, self.hidden_dim)
             else:
                 h0, c0 = hidden
             output, hidden = self.lstm(embeds, (h0, c0))  # (seq_len, batch_size, hidden_dim)

             # Flatten the time and batch axes so each row is one prediction.
             output = self.linear_out(output.view(seq_len * batch_size, -1))
             return output, hidden
    View Code

    train.py——训练模型

      1 import torch
      2 from torch import nn
      3 from torch.autograd import Variable
      4 from torch.optim import Adam
      5 from torch.utils.data import DataLoader
      6 
      7 from data import get_data
      8 from model import PoetryModel
      9 from config import Config
     conf = Config()
     # Use the first GPU only when the configuration asks for it and CUDA is
     # actually usable; otherwise fall back to the CPU instead of failing later
     # when the model is moved to a device that does not exist.
     device = torch.device(
         'cuda:0' if conf.use_gpu and torch.cuda.is_available() else 'cpu')
     12 
     def generate(model, start_words, ix2word, word2ix, prefix_words=None):
         """Continue ``start_words`` into a complete poem.

         The model is first fed ``<START>`` (and, optionally, ``prefix_words``
         to warm up the hidden state), then the given start words one by one;
         after that the most likely next character is appended at every step.

         :param model: callable ``model(input, hidden) -> (output, hidden)``
             taking a (1, 1) index tensor
         :param start_words: characters the poem must begin with
         :param ix2word: mapping index id -> character
         :param word2ix: mapping character -> index id
         :param prefix_words: optional characters used only to prime the hidden
             state; they are not part of the result
         :return: list of characters, starting with ``start_words``; generation
             stops at ``<EOP>`` (not included) or after ``conf.max_gen_len`` steps
         """
         print(start_words)
         results = list(start_words)
         start_word_len = len(start_words)

         # Build inputs on whatever device the model lives on rather than
         # calling .cuda() unconditionally.
         first_param = next(model.parameters(), None)
         run_device = first_param.device if first_param is not None else torch.device('cpu')

         def as_input(ix):
             # The model expects a (seq_len=1, batch=1) tensor of indices.
             return torch.tensor([[ix]], dtype=torch.long, device=run_device)

         step_input = as_input(word2ix['<START>'])
         hidden = None

         # Sampling needs no gradients; without this every step would extend
         # the autograd graph through the hidden state.
         with torch.no_grad():
             if prefix_words:
                 for word in prefix_words:
                     _, hidden = model(step_input, hidden)
                     step_input = as_input(word2ix[word])

             for i in range(conf.max_gen_len):
                 output, hidden = model(step_input, hidden)

                 if i < start_word_len:
                     # Still consuming the caller's start words: ignore the
                     # prediction and feed the next given character.
                     step_input = as_input(word2ix[results[i]])
                     continue

                 top_index = output[0].topk(1)[1][0].item()
                 w = ix2word[top_index]
                 if w == '<EOP>':
                     # End-of-poem marker: stop without adding it to the result.
                     break
                 results.append(w)
                 step_input = as_input(top_index)
         return results
     48 
     def gen_acrostic(model, start_words, ix2word, word2ix, prefix_words=None):
         """Generate an acrostic poem whose sentences begin with ``start_words``.

         Example for start_words = u'深度学习':

             深木通中岳,青苔半日脂。
             度山分地险,逆浪到南巴。
             学道兵犹毒,当时燕不移。
             习根通古岸,开镜出清羸。

         :param model: callable ``model(input, hidden) -> (output, hidden)``
             taking a (1, 1) index tensor
         :param start_words: the characters to place at the head of each sentence
         :param ix2word: mapping index id -> character
         :param word2ix: mapping character -> index id
         :param prefix_words: optional characters used only to prime the hidden
             state; they are not part of the result
         :return: list of generated characters; generation stops once every
             head character has been used and its sentence has ended, or after
             ``conf.max_gen_len`` steps
         """
         results = []
         start_word_len = len(start_words)

         # A new head character is inserted right after any of these tokens.
         # The punctuation had been lost from this set, which meant only the
         # first head character was ever inserted.
         sentence_starts = {u'。', u'!', '<START>'}

         # Build inputs on whatever device the model lives on rather than
         # calling .cuda() unconditionally.
         first_param = next(model.parameters(), None)
         run_device = first_param.device if first_param is not None else torch.device('cpu')

         def as_input(ix):
             # The model expects a (seq_len=1, batch=1) tensor of indices.
             return torch.tensor([[ix]], dtype=torch.long, device=run_device)

         step_input = as_input(word2ix['<START>'])
         hidden = None

         index = 0  # how many head characters have been placed so far
         pre_word = '<START>'  # the previous emitted token

         # Sampling needs no gradients.
         with torch.no_grad():
             if prefix_words:
                 for word in prefix_words:
                     _, hidden = model(step_input, hidden)
                     step_input = as_input(word2ix[word])

             for _ in range(conf.max_gen_len):
                 output, hidden = model(step_input, hidden)
                 # .item() turns the 0-d tensor into a plain int; a tensor is
                 # hashed by identity and would never match a key in ix2word.
                 top_index = output[0].topk(1)[1][0].item()
                 w = ix2word[top_index]

                 if pre_word in sentence_starts:
                     if index == start_word_len:
                         # All head characters have been used: the poem is done.
                         break
                     # Override the prediction with the next head character.
                     w = start_words[index]
                     index += 1

                 # Feed the chosen character (predicted or forced) back in.
                 step_input = as_input(word2ix[w])
                 results.append(w)
                 pre_word = w
         return results
     96 
     def train(**kwargs):
         """Train the poetry model, periodically saving sample poems and weights.

         Keyword arguments override the attributes of the same name on the
         module-level ``conf`` before training starts.

         Every ``conf.save_every`` epochs, one poem per start character is
         written to ``<conf.out_path>/p<epoch>`` and the model weights are saved
         to ``<conf.model_prefix>_<epoch>.pth``.
         """
         import os  # local import: this module does not import os at file level

         for k, v in kwargs.items():
             setattr(conf, k, v)

         # Load the dataset and wrap it in a DataLoader.
         data, word2ix, ix2word = get_data(conf)
         dataloader = DataLoader(dataset=data, batch_size=conf.batch_size,
                                 shuffle=True,
                                 drop_last=True,
                                 num_workers=conf.num_workers)

         model = PoetryModel(len(word2ix), conf, device).to(device)
         optimizer = Adam(model.parameters())
         criterion = nn.CrossEntropyLoss()

         # Make sure the output locations exist before the first save.
         os.makedirs(conf.out_path, exist_ok=True)
         checkpoint_dir = os.path.dirname(conf.model_prefix)
         if checkpoint_dir:
             os.makedirs(checkpoint_dir, exist_ok=True)

         # Guard against an empty loader so the average below cannot divide by 0.
         num_batches = max(len(dataloader), 1)

         for epoch in range(conf.max_epech):
             epoch_loss = 0.0
             for batch in dataloader:
                 # (batch_size, seq_len) -> (seq_len, batch_size)
                 batch = batch.long().transpose(1, 0).contiguous()

                 # Predict each character from the ones before it: the target
                 # is the input shifted by one position.
                 inputs, target = batch[:-1, :], batch[1:, :]
                 inputs, target = inputs.to(device), target.to(device)
                 optimizer.zero_grad()
                 output, _ = model(inputs)

                 loss = criterion(output, target.view(-1))

                 loss.backward()
                 optimizer.step()
                 epoch_loss += loss.item()

             # Mean loss per batch (previously divided by the batch size).
             print("epoch_%d_loss:%0.4f" % (epoch, epoch_loss / num_batches))

             if epoch % conf.save_every == 0:
                 with open('%s/p%d' % (conf.out_path, epoch), 'w', encoding='utf-8') as fout:
                     for word in '春江花月夜':
                         gen_poetry = generate(model, word, ix2word, word2ix)
                         fout.write("".join(gen_poetry) + "\n\n")
                 torch.save(model.state_dict(), "%s_%d.pth" % (conf.model_prefix, epoch))
    152 
    153 
    # Script entry point: run training with the default configuration.
    if __name__ == "__main__":
        train()
    View Code

    最终效果:

    春雨,君王背日暮花枝。桂花飘雨裛芙蓉,花蕚垂红绾芙蓉。上天高峨落不归,中有一枝春未老。一枝香蘂红妆结,春风吹花飘落萼。今朝今日凌风沙,今日还家花落早。东风吹落柳条生,柳色参差春水东。昨日风烟花满树,今日东风正如萍。杏园春色不自胜,青春忽倒春风来。春风不哢花枝落,况复春风花满枝。

    江上春未央,春光照四面。一日一千里,一朝一瞬息。不如塌然云,不见巢下树。一身一何讬,万事皆有敌。君子不敢横,君心若为役。呜呼两鬓苦,又如寒玉翦。不知何代费,所以心不殒。一朝得之愚,所以心所施。我亦我未领,我来亦未归。始知与君子,不觉身不饥。彭泽有余事,吾君何所为。何以为我人,於今有耆谁。

    花间一人家,十五日中见。一朝出门门,不见君子诺。车骑徒自媒,朱绂不能竞。拜军拜车骑,倏忽嫖姚羌。既无征镇愤,慷慨望乡国。一朝辞虏府,暮宿在蓟垒。君子傥封侯,今人在咸朔。英英在其间,日昃不敢作。云山互相见,魏阙空怀戚。何必在沛人,裴回眇眇。所念无穷,斯人不怠。。

    月白风来吹,君心不可攀。从来一字内,不觉一朝闲。未达身难弃,衰容事不闲。不忧讥孺子,不觉老农闲。寝食能供给,闲橙媿漉肱。酒阑湘口臥,窗拔峡添灯。静谭畬茶骇,遥闻夜笛闲。芦洲多雨霁,石火带霜蒸。酿酒眠新熟,扁舟醉自闲。夜渔疎竹坞,春水钓渔关。石笋穿云烧,江花带笋斑。此时多好客,不敢问山僧。

    夜夜拍人笑,春风弄酒丝。花开桃李岭,花落洞庭春。酒思同君醉,诗成是袜尘。自怜心已矣,何事梦何如。摈世才难易,伤心镜不如。脸如银烛薄,色映玉楼嚬。绣户雕筵软,鸳鸯拂枕春。相逢期洛下,梦想忆扬秦。玉匣调金鼎,金盘染髻巾。鷰人曾有什,山寺不相亲。鹤毳应相毒,蝇蚊爽有真。空余襟袖下,不觉世间人。

    参考博客:https://blog.csdn.net/jiangpeng59/article/details/81003058

  • 相关阅读:
    js 正则表达式 test match exec三个方法的异同
    网页使用MD5加密
    解决Google地图和字体api无法加载的问题(转)
    Javascript 的addEventListener()及attachEvent()区别分析
    get与post的区别
    清除浮动的几种方法
    zoom属性(IE私有属性)
    class,id和name的区别
    深夜偷精之反射函数
    jQuery和js区别
  • 原文地址:https://www.cnblogs.com/tangweijqxx/p/10608997.html
Copyright © 2020-2023  润新知