• TextRNN for intent recognition in question answering


    1. Use TextRNN for text classification to perform intent recognition in question answering.

    2. Model structure: embedding layer → bidirectional LSTM → fully connected classification layer.

    3. Code (full program: https://github.com/jiangnanboy/movie_knowledge_graph_app/tree/master/intent_classification/pytorch/textrnn)

    import os
    import torch
    from torchtext import data,datasets
    from torchtext.data import Iterator, BucketIterator
    from torchtext.vocab import Vectors
    from torch import nn,optim
    import torch.nn.functional as F
    import pandas as pd
    import pickle
    
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    intent_classification_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
    # path to the training data
    train_data = os.path.join(intent_classification_path,'classification_data/classification_data.csv')
    # read the training data
    train_data = pd.read_csv(train_data)
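    # Assumed layout of classification_data.csv (hypothetical example, the file itself is not
    # shown in this post): a 'text' column holding the query pre-segmented into space-separated
    # characters and a 'label' column holding an integer intent id (the model below predicts
    # one of 16 classes), e.g.  text = "天 龙 八 部 的 导 演 是 谁", label = 3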
    # tokenize by character (the text is pre-segmented, so we just split on spaces)
    tokenize = lambda x: x.split(' ')
    
    # TEXT field: lowercase, tokenize with the function above, pad/truncate to 20 tokens
    TEXT = data.Field(
                        sequential=True,
                        tokenize=tokenize,
                        lower=True,
                        use_vocab=True,
                        pad_token='<pad>',
                        unk_token='<unk>',
                        batch_first=True,
                        fix_length=20)
    
    # LABEL field: labels are already integer ids, so no vocabulary is built for them
    LABEL = data.Field(
                        sequential=False,
                        use_vocab=False)
    # build training or test dataset examples
    def get_dataset(csv_data, text_field, label_field, test=False):
        fields = [('id', None), ('text', text_field), ('label', label_field)]
        examples = []
        if test: # test set: labels are not loaded
            for text in csv_data['text']:
                examples.append(data.Example.fromlist([None, text, None], fields))
        else: # training set
            for text, label in zip(csv_data['text'], csv_data['label']):
                examples.append(data.Example.fromlist([None, text, label], fields))
        return examples, fields
    
    train_examples,train_fields = get_dataset(train_data, TEXT, LABEL)
    
    train = data.Dataset(train_examples, train_fields)
    # pretrained embedding vectors
    pretrained_embedding = os.path.join(os.getcwd(), 'sgns.sogou.char')
    vectors = Vectors(name=pretrained_embedding)
    # build the vocabulary and attach the pretrained vectors
    TEXT.build_vocab(train, min_freq=1, vectors = vectors)
    
    # save the vocabulary so it can be reloaded at inference time
    words_path = os.path.join(os.getcwd(), 'words.pkl')
    with open(words_path, 'wb') as f_words:
        pickle.dump(TEXT.vocab, f_words)
        
    BATCH_SIZE = 163
    # build the training iterator
    train_iter = BucketIterator(
                                dataset=train,
                                batch_size=BATCH_SIZE,
                                shuffle=True,
                                sort_within_batch=False)
    
    
    # define the classification model
    class TextRNN(nn.Module):
        def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, output_size, dropout=0.5):
            super(TextRNN, self).__init__()
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            # batch_first=True only affects the input and output tensors; the hidden and cell states still have batch on dim 1
            self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, bidirectional=True, batch_first=True, dropout=dropout)
            self.fc = nn.Linear(hidden_size*2, output_size)
            
        def forward(self, x):
            # x :(batch, seq_len) = (163, 20)
            # [batch,seq_len,embedding_dim] -> (163, 20, 300)
            x = self.embedding(x) 
            #out=[batch_size, seq_len, hidden_size*2]
            #h=[num_layers*2, batch_size, hidden_size]
            #c=[num_layers*2, batch_size, hidden_size]
            out,(h, c)= self.lstm(x)
            # classify using the output at the last time step
            out = self.fc(out[:, -1, :])
            return out
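    # Note (not in the original code): out[:, -1, :] is the output at the last time step.
    # In a bidirectional LSTM the backward direction has only seen one token at that
    # position, so a common alternative is to concatenate the final hidden states of the
    # top layer's two directions instead:
    #     out = self.fc(torch.cat((h[-2], h[-1]), dim=1))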
    from torch.utils.tensorboard import SummaryWriter
    writer = SummaryWriter(os.getcwd()+'/log', comment='textrnn')
    
    # training
    
    # build the model: embedding dim from the pretrained vectors, 128 hidden units, 2 LSTM layers, 16 intent classes
    model = TextRNN(len(TEXT.vocab), TEXT.vocab.vectors.shape[1], 128, 2, 16).to(DEVICE)
    # initialize the embedding layer with the pretrained vectors; requires_grad stays True, so it can be fine-tuned
    model.embedding.weight.data.copy_(TEXT.vocab.vectors)
    # switch to training mode
    model.train()
    # optimizer and loss (CrossEntropyLoss expects raw logits, which the fc layer outputs)
    # optimizer = torch.optim.Adam(model.parameters(), lr=0.1, weight_decay=0.1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.95, nesterov=True)
    criterion = nn.CrossEntropyLoss()
    
    with writer:
        for iter in range(600):
            for i, batch in enumerate(train_iter):
                train_text = batch.text
                train_label = batch.label
                train_text = train_text.to(DEVICE)
                train_label = train_label.to(DEVICE)
                out = model(train_text)
                loss = criterion(out, train_label)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if (iter+1) % 10 == 0:
                    print('iter [{}/{}], Loss: {:.4f}'.format(iter+1, 600, loss.item()))
                #writer.add_graph(model, input_to_model=train_text,verbose=False)
                writer.add_scalar('loss',loss.item(),global_step=iter+1)
        writer.flush()
        writer.close()
                
    # save the trained weights (a standard PyTorch state_dict, despite the .h5 extension)
    model_path = os.path.join(os.getcwd(), "model.h5")
    torch.save(model.state_dict(), model_path)

     
     
    iter [10/300], Loss: 2.7375
    iter [20/300], Loss: 2.7167
    iter [30/300], Loss: 2.7200
    iter [40/300], Loss: 2.7181
    iter [50/300], Loss: 2.7152
    iter [60/300], Loss: 2.7142
    iter [70/300], Loss: 2.7107
    iter [80/300], Loss: 2.7006
    iter [90/300], Loss: 2.6418
    iter [100/300], Loss: 2.3200
    iter [110/300], Loss: 2.1519
    iter [120/300], Loss: 2.0308
    iter [130/300], Loss: 2.3037
    iter [140/300], Loss: 2.1125
    iter [150/300], Loss: 1.9677
    iter [160/300], Loss: 1.8521
    iter [170/300], Loss: 1.8255
    iter [180/300], Loss: 1.7360
    iter [190/300], Loss: 1.7009
    iter [200/300], Loss: 1.5200
    iter [210/300], Loss: 2.2366
    iter [220/300], Loss: 1.3887
    iter [230/300], Loss: 1.2162
    iter [240/300], Loss: 1.0565
    iter [250/300], Loss: 0.8792
    iter [260/300], Loss: 0.7540
    iter [270/300], Loss: 0.6081
    iter [280/300], Loss: 0.5910
    iter [290/300], Loss: 0.4691
    iter [300/300], Loss: 0.3708
    iter [310/300], Loss: 0.3033
    iter [320/300], Loss: 0.2526
    iter [330/300], Loss: 0.2226
    iter [340/300], Loss: 0.2168
    iter [350/300], Loss: 0.1647
    iter [360/300], Loss: 0.1403
    iter [370/300], Loss: 0.1242
    iter [380/300], Loss: 0.0933
    iter [390/300], Loss: 0.0965
    iter [400/300], Loss: 0.0592
    iter [410/300], Loss: 0.1430
    iter [420/300], Loss: 0.0605
    iter [430/300], Loss: 0.0411
    iter [440/300], Loss: 0.0747
    iter [450/300], Loss: 0.0293
    iter [460/300], Loss: 0.0190
    iter [470/300], Loss: 0.0196
    iter [480/300], Loss: 0.0179
    iter [490/300], Loss: 0.0113
    iter [500/300], Loss: 0.0102
    iter [510/300], Loss: 0.0094
    iter [520/300], Loss: 0.0087
    iter [530/300], Loss: 0.0168
    iter [540/300], Loss: 0.0049
    iter [550/300], Loss: 0.0046
    iter [560/300], Loss: 0.0051
    iter [570/300], Loss: 0.0028
    iter [580/300], Loss: 0.0025
    iter [590/300], Loss: 0.0021
    iter [600/300], Loss: 0.0020
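
    After training, the saved artifacts can be used to classify new queries. Below is a minimal inference sketch (not from the original post), reusing the TextRNN class defined above and assuming the words.pkl vocabulary and model.h5 weights saved by the training script; the query must be pre-segmented with spaces like the training data, and the mapping from intent id to intent name is application-specific.

    import pickle
    import torch

    # load the vocabulary that was pickled during training
    with open('words.pkl', 'rb') as f_words:
        vocab = pickle.load(f_words)

    # rebuild the model with the same hyper-parameters used for training
    model = TextRNN(len(vocab), vocab.vectors.shape[1], 128, 2, 16)
    model.load_state_dict(torch.load('model.h5', map_location='cpu'))
    model.eval()

    def predict(query, fix_length=20):
        # query is a space-separated string of characters, like the training data
        tokens = query.lower().split(' ')[:fix_length]
        # map characters to ids and pad up to the fixed length
        ids = [vocab.stoi.get(tok, vocab.stoi['<unk>']) for tok in tokens]
        ids += [vocab.stoi['<pad>']] * (fix_length - len(ids))
        with torch.no_grad():
            out = model(torch.tensor([ids]))  # shape: (1, 16)
        return out.argmax(dim=1).item()       # predicted intent id

    # hypothetical query; prints an intent id in [0, 15]
    print(predict('天 龙 八 部 的 导 演 是 谁'))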
  • Original post: https://www.cnblogs.com/little-horse/p/14271649.html