• NLP (15): Text similarity with word2vec + ESIM


    1. Preparing the dataset

    models/esim_model/my_dataset.py

    import torch.utils.data as data
    
    
    class MyDataset(data.Dataset):
        """Wraps two parallel lists of sentences and their similarity labels."""
        def __init__(self, texta, textb, label):
            self.texta = texta
            self.textb = textb
            self.label = label

        def __getitem__(self, item):
            texta = self.texta[item]
            textb = self.textb[item]
            label = self.label[item]
            return texta, textb, label

        def __len__(self):
            return len(self.texta)
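
    The dataset returns the raw sentence strings; they are only converted into word2vec vectors later, inside the training loop. A minimal usage sketch (the sentences and labels below are made up purely for illustration):

    from torch.utils.data import DataLoader
    import torch

    # hypothetical toy data, only to show the expected types
    texta = ["今天天气不错", "我想退款"]
    textb = ["今天天气很好", "怎么申请退货"]
    label = torch.LongTensor([1, 0])

    loader = DataLoader(MyDataset(texta, textb, label), batch_size=2, shuffle=True)
    for a, b, y in loader:
        # a and b are tuples of raw strings; y is a LongTensor of 0/1 labels
        print(a, b, y)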

    2. Using word2vec in place of the Embedding layer

    models/esim_model/my_word2vec.py

    from gensim.models.fasttext import FastText
    import torch
    import numpy as np
    import os
    
    class WordEmbedding(object):
        def __init__(self):
            parent_path = os.path.split(os.path.realpath(__file__))[0]
            self.root = parent_path[:parent_path.find("models")]  # project root
            self.word_fasttext = os.path.join(self.root, "checkpoints", "word2vec", "word_fasttext.model")
            self.char_fasttext = os.path.join(self.root, "checkpoints", "word2vec", "char_fasttext.model")
            # character-level FastText vectors serve as a frozen embedding layer
            self.model = FastText.load(self.char_fasttext)

        def sentenceTupleToEmbedding(self, data1, data2):
            # pad both batches to the length of the longest sentence (in characters)
            aCutListMaxLen = max([len(list(str(sentence_a))) for sentence_a in data1])
            bCutListMaxLen = max([len(list(str(sentence_b))) for sentence_b in data2])
            seq_len = max(aCutListMaxLen, bCutListMaxLen)
            a = self.sqence_vec(data1, seq_len)  # batch_size * seq_len * embedding_dim
            b = self.sqence_vec(data2, seq_len)
            return torch.FloatTensor(a), torch.FloatTensor(b)

        def sqence_vec(self, data, seq_len):
            data_a_vec = []
            for sequence_a in data:
                sequence_vec = []  # will become seq_len * 128
                for word_a in list(str(sequence_a)):
                    if word_a in self.model.wv:
                        sequence_vec.append(self.model.wv[word_a])
                # keep a 2-D shape even when no character is in the vocabulary
                sequence_vec = np.array(sequence_vec).reshape(-1, 128)
                add = np.zeros((seq_len - sequence_vec.shape[0], 128))  # zero padding up to seq_len
                sequenceVec = np.vstack((sequence_vec, add))
                data_a_vec.append(sequenceVec)
            a_vec = np.array(data_a_vec)
            return a_vec
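
    The code assumes a character-level FastText model already exists at checkpoints/word2vec/char_fasttext.model. For reference, here is a rough sketch of how such a model could be trained with the gensim 4.x API (the corpus file name and all hyperparameters except the 128-dimensional vectors are assumptions, not the author's original settings):

    from gensim.models.fasttext import FastText

    # hypothetical corpus: one sentence per line, split into characters
    sentences = []
    with open("sim_corpus.txt", encoding="utf-8") as f:  # assumed file name
        for line in f:
            sentences.append(list(line.strip()))

    # vector_size must match the 128 dimensions expected by WordEmbedding/ESIM
    model = FastText(vector_size=128, window=5, min_count=1)
    model.build_vocab(corpus_iterable=sentences)
    model.train(corpus_iterable=sentences, total_examples=len(sentences), epochs=10)
    model.save("checkpoints/word2vec/char_fasttext.model")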

    3. The model

    models/esim_model/my_esim.py

    import torch.nn as nn
    import torch.nn.functional as F
    import torch
    
    class ESIM(nn.Module):
        def __init__(self):
            super(ESIM, self).__init__()
            self.dropout = 0.5
            self.hidden_size = 128
            self.embeds_dim = 128
    
            self.bn_embeds = nn.BatchNorm1d(self.embeds_dim)
            self.lstm1 = nn.LSTM(self.embeds_dim, self.hidden_size, batch_first=True, bidirectional=True)
            self.lstm2 = nn.LSTM(self.hidden_size * 8, self.hidden_size, batch_first=True, bidirectional=True)
    
            self.fc = nn.Sequential(
                nn.BatchNorm1d(self.hidden_size * 8),
                nn.Linear(self.hidden_size * 8, 2),
                nn.ELU(inplace=True),
                nn.BatchNorm1d(2),
                nn.Dropout(self.dropout),
                nn.Linear(2, 2),
                nn.ELU(inplace=True),
                nn.BatchNorm1d(2),
                nn.Dropout(self.dropout),
                nn.Linear(2, 1),
            )
    
        def soft_attention_align(self, x1, x2):
            '''
            x1: batch_size * seq_len * dim
            x2: batch_size * seq_len * dim
            '''
            # attention: batch_size * seq_len * seq_len
            attention = torch.matmul(x1, x2.transpose(1, 2))
            # mask1 = mask1.float().masked_fill_(mask1, float('-inf'))
            # mask2 = mask2.float().masked_fill_(mask2, float('-inf'))
    
            # weight: batch_size * seq_len * seq_len
            # weight1 = F.softmax(attention + mask2.unsqueeze(1), dim=-1)
            weight1 = F.softmax(attention, dim=-1)
            x1_align = torch.matmul(weight1, x2)
            # weight2 = F.softmax(attention.transpose(1, 2) + mask1.unsqueeze(1), dim=-1)
            weight2 = F.softmax(attention.transpose(1, 2), dim=-1)
            x2_align = torch.matmul(weight2, x1)
            # x_align: batch_size * seq_len * (2 * hidden_size)
    
            return x1_align, x2_align
    
        def submul(self, x1, x2):
            mul = x1 * x2
            sub = x1 - x2
            return torch.cat([sub, mul], -1)
    
        def apply_multiple(self, x):
            # input: batch_size * seq_len * (2 * hidden_size)
            p1 = F.avg_pool1d(x.transpose(1, 2), x.size(1)).squeeze(-1)
            p2 = F.max_pool1d(x.transpose(1, 2), x.size(1)).squeeze(-1)
            # output: batch_size * (4 * hidden_size)
            return torch.cat([p1, p2], 1)
    
        def forward(self, x1, x2):
            # x1, x2: batch_size * seq_len * embeds_dim
            # The inputs are already word2vec/FastText vectors, so the usual
            # nn.Embedding lookup of the original ESIM is left commented out:
            # x1 = self.bn_embeds(self.embeds(sent1).transpose(1, 2).contiguous()).transpose(1, 2)
            # x2 = self.bn_embeds(self.embeds(sent2).transpose(1, 2).contiguous()).transpose(1, 2)

            # batch_size * seq_len * embeds_dim => batch_size * seq_len * (2 * hidden_size)
            o1, _ = self.lstm1(x1)
            o2, _ = self.lstm1(x2)
    
            # Attention
            # batch_size * seq_len * (2 * hidden_size)
            q1_align, q2_align = self.soft_attention_align(o1, o2)
    
            # Compose
            # batch_size * seq_len * (8 * hidden_size)
            q1_combined = torch.cat([o1, q1_align, self.submul(o1, q1_align)], -1)
            q2_combined = torch.cat([o2, q2_align, self.submul(o2, q2_align)], -1)
    
            # batch_size * seq_len * (2 * hidden_size)
            q1_compose, _ = self.lstm2(q1_combined)
            q2_compose, _ = self.lstm2(q2_combined)
    
            # Aggregate
            # input: batch_size * seq_len * (2 * hidden_size)
            # output: batch_size * (4 * hidden_size)
            q1_rep = self.apply_multiple(q1_compose)
            q2_rep = self.apply_multiple(q2_compose)
    
            # Classifier
            x = torch.cat([q1_rep, q2_rep], -1)
            similarity = self.fc(x)
            return similarity
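
    A quick sanity check of the tensor shapes with random inputs (the batch size and sequence length below are arbitrary):

    import torch

    model = ESIM()
    model.eval()  # put BatchNorm/Dropout into inference mode for this one-off check

    a = torch.randn(4, 20, 128)  # batch_size=4, seq_len=20, embeds_dim=128
    b = torch.randn(4, 20, 128)
    with torch.no_grad():
        logits = model(a, b)
    print(logits.shape)  # torch.Size([4, 1]) - one similarity logit per sentence pair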

    4. Running the model

    import torch
    import os
    from torch.utils.data import DataLoader
    from my_dataset import MyDataset
    import pandas as pd
    import numpy as np
    from my_esim import ESIM
    import torch.nn as nn
    from my_word2vec import WordEmbedding
    
    class RunESIM():
        def __init__(self):
            self.learning_rate = 0.001
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            parent_path = os.path.split(os.path.realpath(__file__))[0]
            self.root = parent_path[:parent_path.find("models")]  # project root
            self.train_path = os.path.join(self.root, "datas", "bert_data", "sim_data", "train.csv")
            self.val_path = os.path.join(self.root, "datas", "bert_data", "sim_data", "val.csv")
            self.test_path = os.path.join(self.root, "datas", "bert_data", "sim_data", "test.csv")
            self.batch_size = 64
            self.epoch = 50
            self.criterion = nn.BCEWithLogitsLoss().to(self.device)
            self.word = WordEmbedding()
            self.check_point = os.path.join(self.root, "checkpoints", "char_bilstm", "char_bilstm.pth")
    
        def get_loader(self, path):
            data = pd.read_csv(path, sep="\t")
            d1, d2, y = data["s1"], data["s2"], list(data["y"])
            dataset = MyDataset(d1, d2, torch.LongTensor(y))
            data_iter = DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=True)
            return data_iter
    
        def binary_acc(self, preds, y):
            preds = torch.round(torch.sigmoid(preds))
            correct = torch.eq(preds, y).float()
            acc = correct.sum() / len(correct)
            return acc
    
        def train(self, mynet, train_iter, optimizer, criterion, epoch, device):
            avg_acc = []
            avg_loss = []
            mynet.train()
            for batch_id, (data1, data2, label) in enumerate(train_iter):
                try:
                    a, b = self.word.sentenceTupleToEmbedding(data1, data2)
                except Exception as e:
                    print("failed to embed batch:", e)
                    continue  # skip the batch instead of reusing stale tensors
                a, b, label = a.to(device), b.to(device), label.to(device)
                distence = mynet(a, b)
                distence = distence.squeeze(1)
                loss = criterion(distence, label.float())
    
                acc = self.binary_acc(distence, label.float()).item()
                avg_acc.append(acc)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if batch_id % 100 == 0:
                    print("epoch:", epoch, "batch:", batch_id, "train loss:", loss.item(), "accuracy:", acc)
                avg_loss.append(loss.item())
            avg_acc = np.array(avg_acc).mean()
            avg_loss = np.array(avg_loss).mean()
            print('train acc:', avg_acc)
            print("train loss", avg_loss)
    
        def eval(self, mynet, test_iter, criteon, epoch, device):
            mynet.eval()
            avg_acc = []
            avg_loss = []
            with torch.no_grad():
                for batch_id, (data1, data2, label) in enumerate(test_iter):
                    try:
                        a, b = self.word.sentenceTupleToEmbedding(data1, data2)
                    except Exception as e:
                        continue
    
                    a, b, label = a.to(device), b.to(device), label.to(device)
                    distence = mynet(a, b)
                    distence = distence.squeeze(1)
                    loss = criteon(distence, label.float())
                    acc = self.binary_acc(distence, label.float()).item()
                    avg_acc.append(acc)
                    avg_loss.append(loss.item())
                    if batch_id > 50:
                        # evaluate only a subset of batches to keep validation quick
                        break
            avg_acc = np.array(avg_acc).mean()
            avg_loss = np.array(avg_loss).mean()
            print('>>test acc:', avg_acc)
            print(">>test loss:", avg_loss)
            return (avg_acc, avg_loss)
    
        def run_train(self):
            model = ESIM().to(self.device)
            max_acc = 0
            train_iter = self.get_loader(self.train_path)
            val_iter = self.get_loader(self.val_path)
            optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
    
            for epoch in range(self.epoch):
                self.train(model, train_iter, optimizer, self.criterion, epoch, self.device)
                eval_acc, eval_loss = self.eval(model, val_iter, self.criterion, epoch, self.device)
                if eval_acc > max_acc:
                    print("save model")
                    torch.save(model.state_dict(), self.check_point)
                    max_acc = eval_acc
    
    if __name__ == '__main__':
        RunESIM().run_train()
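
    Once training has saved a checkpoint, scoring a new sentence pair could look roughly like the sketch below (the checkpoint path and the example sentences are assumptions; sigmoid turns the single logit into a similarity probability):

    import torch
    from my_esim import ESIM
    from my_word2vec import WordEmbedding

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ESIM().to(device)
    model.load_state_dict(torch.load("checkpoints/char_bilstm/char_bilstm.pth", map_location=device))
    model.eval()

    word = WordEmbedding()
    a, b = word.sentenceTupleToEmbedding(["今天天气不错"], ["今天天气很好"])  # hypothetical pair
    with torch.no_grad():
        logit = model(a.to(device), b.to(device))
        prob = torch.sigmoid(logit).item()  # probability that the two sentences are similar
    print(prob)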

    5. Experimental results

    epoch: 30 batch: 0 train loss: 0.30833131074905396 accuracy: 0.875
    epoch: 30 batch: 100 train loss: 0.15703552961349487 accuracy: 0.953125
    epoch: 30 batch: 200 train loss: 0.25020748376846313 accuracy: 0.90625
    epoch: 30 batch: 300 train loss: 0.2216322124004364 accuracy: 0.90625
    epoch: 30 batch: 400 train loss: 0.21571914851665497 accuracy: 0.921875
    epoch: 30 batch: 500 train loss: 0.23061133921146393 accuracy: 0.890625
    epoch: 30 batch: 600 train loss: 0.2357763797044754 accuracy: 0.90625
    epoch: 30 batch: 700 train loss: 0.180502250790596 accuracy: 0.9375
    epoch: 30 batch: 800 train loss: 0.3004327118396759 accuracy: 0.875
    epoch: 30 batch: 900 train loss: 0.22875544428825378 accuracy: 0.90625
    epoch: 30 batch: 1000 train loss: 0.21407470107078552 accuracy: 0.921875
    epoch: 30 batch: 1100 train loss: 0.20641490817070007 accuracy: 0.921875
    epoch: 30 batch: 1200 train loss: 0.2836620509624481 accuracy: 0.875
    train acc: 0.8965875
    train loss 0.2476300214469433
    >>test acc: 0.9613281264901161
    >>test loss: 0.10271382739301771
  • Original article: https://www.cnblogs.com/zhangxianrong/p/14773769.html