• Full PyTorch LSTM sentiment-classification code


    Run main.py first to serialize the text and build the vocabulary, then run train.py to train the model.
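
    config.py

    (config.py is imported by every module below but never shown in the post; the following is a minimal sketch consistent with how it is used. The attribute names come from the other files; the values of max_len and the batch sizes are assumptions.)

    import pickle
    import torch

    # vocabulary produced by main.py -- it does not exist until main.py has run once
    try:
        ws = pickle.load(open("./models/ws.pkl","rb"))
    except FileNotFoundError:
        ws = None

    max_len = 200            # assumed sequence length after padding/truncation
    train_batch_size = 128   # assumed
    test_batch_size = 128    # assumed

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")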

    dataset.py

    from torch.utils.data import DataLoader,Dataset
    import torch
    import os
    from utils import tokenlize
    import config
    
    
    
    class ImdbDataset(Dataset):
        def __init__(self,train=True):
            super(ImdbDataset,self).__init__()
            data_path = r"H:73-nlp自然语言处理-v5.bt38[周大伟]73-nlp自然语言处理-v5.bt38[周大伟]第四天代码dataaclImdb_v1aclImdb"
            data_path += r"	rain" if train else r"	est"
            self.total_path = []
            for temp_path in [r"pos",r"
    eg"]:
                cur_path = data_path + temp_path
                self.total_path += [os.path.join(cur_path,i) for i in os.listdir(cur_path) if i.endswith(".txt")]
    
        def __getitem__(self, idx):
            file = self.total_path[idx]
            review = open(file,encoding="utf-8").read()
            review = tokenlize(review)
            label = int(file.split("_")[-1].split(".")[0])
            label = 0 if label < 5 else 1
            return review,label
    
        def __len__(self):
            return len(self.total_path)
    
    def collate_fn(batch):
        '''
        Collate a batch of samples into tensors
        :param batch: list of (review, label) pairs
        :return: (reviews, labels) as LongTensors
        '''
        reviews,labels = zip(*batch)
        reviews = torch.LongTensor([config.ws.transform(i,max_len=config.max_len) for i in reviews])
        labels = torch.LongTensor(labels)
        return reviews,labels
    
    
    def get_dataloader(train):
        imdbdataset = ImdbDataset(train=train)
        batch_size = config.train_batch_size if train else config.test_batch_size
        return DataLoader(imdbdataset,batch_size=batch_size,shuffle=True,collate_fn=collate_fn)
    
    
    if __name__ == '__main__':
        # dataset = ImdbDataset(train=True)
        # print(dataset[1])
        for idx,(review,label) in enumerate(get_dataloader(train=True)):
            print(review)
            print(label)
            break
    

      utils.py

    """
    实现额外的方法
    """
    import re
    
    def tokenlize(sentence):
        """
        Tokenize a piece of text
        :param sentence: str
        :return: [str,str,str]
        """

        fileters = ['!', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>',
                    '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n', '\x97', '\x96', '”', '“', ]
        sentence = sentence.lower()  # lowercase everything
        sentence = re.sub("<br />"," ",sentence)
        # sentence = re.sub("I'm","I am",sentence)
        # sentence = re.sub("isn't","is not",sentence)
        # escape each filter so regex metacharacters like ( ) * + are matched literally
        sentence = re.sub("|".join(re.escape(f) for f in fileters)," ",sentence)
        result = [i for i in sentence.split(" ") if len(i)>0]
    
        return result
    
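
    A quick check of what tokenlize returns (the sample sentence here is made up):

    from utils import tokenlize

    print(tokenlize("I loved this movie!<br />It isn't perfect, but close."))
    # ['i', 'loved', 'this', 'movie', 'it', "isn't", 'perfect', 'but', 'close']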

    word_sequence.py

    '''
    Text serialization: map tokens to integer indices
    '''
    
    class WordSequence():
        UNK_TAG = "<UNK>"
        PAD_TAG = "<PAD>"
        UNK = 1
        PAD = 0
    
        def __init__(self):
            self.dict = {
                self.UNK_TAG:self.UNK,
                self.PAD_TAG:self.PAD
            }
            self.count = {}
    
    
        def fit(self,sentence):
            '''
            Accumulate word frequencies
            :param sentence:
            :return:
            '''
            for word in sentence:
                self.count[word] = self.count.get(word,0)+1
    
        def build_vocab(self,min_count=0,max_count = None,max_features = None):
            """
            Build the vocabulary subject to the given constraints
            :param min_count: minimum word frequency
            :param max_count: maximum word frequency
            :param max_features: maximum vocabulary size
            :return:
            """
            if min_count is not None:
                self.count = {word:count for word,count in self.count.items() if count >min_count}
            if max_count is not None:
                self.count = {word:count for word,count in self.count.items() if count<max_count}
            if max_features is not None:
                # sort by frequency and keep the max_features most frequent words
                self.count = dict(sorted(self.count.items(),key=lambda x:x[-1],reverse=True)[:max_features])

            for word in self.count:
                self.dict[word] = len(self.dict) # each word gets the next free index

            # invert the dict once, after all words have been added
            self.inverse_dict = dict(zip(self.dict.values(),self.dict.keys()))
    
        def transform(self,sentence,max_len =None):
            '''
            Convert a sentence into a sequence of indices, truncated or padded to max_len
            :param sentence:
            :return:
            '''
            if max_len is not None:
                if len(sentence) > max_len:
                    sentence = sentence[:max_len]
                else:
                    sentence = sentence + [self.PAD_TAG]*(max_len-len(sentence))
            return [self.dict.get(i,self.UNK) for i in sentence]
    
        def inverse_transform(self,indices):
            """
            Convert a sequence of indices back into tokens
            :param indices:
            :return:
            """
            return [self.inverse_dict.get(i,self.UNK_TAG) for i in indices]
    
        def __len__(self):
            return len(self.dict)
    
    if __name__ == '__main__':
        sentences = [["今天","天气","很","好"],
                     ["今天","去","吃","什么"]]
    
        ws = WordSequence()
        for sentence in sentences:
            ws.fit(sentence)
    
        ws.build_vocab(min_count=0)
        print(ws.dict)
        ret = ws.transform(["好","热","呀","呀","呀","呀","呀","呀","呀"],max_len=5)
        print(ret)
        ret = ws.inverse_transform(ret)
        print(ret)
    

      main.py

    from word_sequence import WordSequence
    from dataset import ImdbDataset
    import os
    import pickle
    from tqdm import tqdm

    if __name__ == '__main__':
        ws = WordSequence()
        # iterate over the raw datasets rather than the DataLoader: the
        # DataLoader's collate_fn already needs a finished vocabulary (config.ws)
        train_data = ImdbDataset(train=True)
        test_data = ImdbDataset(train=False)
        for review,label in tqdm(train_data,total=len(train_data)):
            ws.fit(review)
        for review,label in tqdm(test_data,total=len(test_data)):
            ws.fit(review)
        print("Building vocabulary...")
        ws.build_vocab()
        print(len(ws))
        os.makedirs("./models",exist_ok=True)  # make sure the output directory exists
        pickle.dump(ws,open("./models/ws.pkl","wb"))
    

      model.py

    """
    构建模型
    """
    import torch.nn as nn
    import config
    import torch.nn.functional as F
    
    class ImdbModel(nn.Module):
        def __init__(self):
            super(ImdbModel,self).__init__()
            self.embedding = nn.Embedding(num_embeddings=len(config.ws),embedding_dim=300,padding_idx=config.ws.PAD)
            self.fc = nn.Linear(config.max_len*300,2)
    
        def forward(self,input):
            '''
            :param input: [batch_size, max_len]
            :return: log-probabilities over the two classes
            '''
            input_embeded = self.embedding(input)  # [batch_size, max_len, 300]

            # flatten each sample's embeddings into one vector for the linear layer
            input_embeded_viewed = input_embeded.view(input_embeded.size(0),-1)

            out = self.fc(input_embeded_viewed)
            return F.log_softmax(out,dim=-1)
    
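
    A quick shape check for this baseline model (it assumes the config.py sketch above, i.e. max_len = 200 and a vocabulary already built by main.py):

    import torch
    import config
    from model import ImdbModel

    model = ImdbModel()
    dummy = torch.zeros(4,config.max_len,dtype=torch.long)  # a batch of 4 all-PAD sequences
    out = model(dummy)
    print(out.shape)  # torch.Size([4, 2]): log-probabilities for the two classes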

      LSTMmodel.py

    """
    构建模型
    """
    import torch.nn as nn
    import torch
    import config
    import torch.nn.functional as F
    
    class ImdbModel(nn.Module):
        def __init__(self):
            super(ImdbModel,self).__init__()
            self.embedding = nn.Embedding(num_embeddings=len(config.ws),embedding_dim=300,padding_idx=config.ws.PAD)
            # input_size must equal the embedding dimension (300)
            self.lstm = nn.LSTM(input_size=300,hidden_size=64,num_layers=2,batch_first=True,bidirectional=True,dropout=0.5)
            self.fc1 = nn.Linear(64*2,64)
            self.fc2 = nn.Linear(64,2)
    
        def forward(self,input):
            '''
            :param input: [batch_size, max_len]
            :return: log-probabilities over the two classes
            '''
            input_embeded = self.embedding(input)    #[batch_size,seq_len,300]

            output,(h_n,c_n) = self.lstm(input_embeded)
            # h_n: [num_layers*num_directions, batch_size, hidden_size];
            # concatenate the last layer's forward (h_n[-2]) and backward (h_n[-1]) final states
            out = torch.cat([h_n[-2,:,:],h_n[-1,:,:]],dim=-1)

            # fully-connected layer
            out_fc1 = self.fc1(out)
            # ReLU
            out_fc1_relu = F.relu(out_fc1)
            # second fully-connected layer
            out = self.fc2(out_fc1_relu)
            return F.log_softmax(out,dim=-1)
    
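
    The h_n[-2] / h_n[-1] indexing relies on how PyTorch lays out the final hidden states of a multi-layer bidirectional LSTM; a standalone sketch to verify that layout:

    import torch
    import torch.nn as nn

    lstm = nn.LSTM(input_size=300,hidden_size=64,num_layers=2,batch_first=True,bidirectional=True)
    x = torch.randn(4,10,300)            # [batch_size, seq_len, input_size]
    output,(h_n,c_n) = lstm(x)

    print(output.shape)  # [4, 10, 128]: per-step outputs, both directions concatenated
    print(h_n.shape)     # [4, 4, 64]: num_layers * num_directions = 4 final states
    # rows are ordered (layer0-fwd, layer0-bwd, layer1-fwd, layer1-bwd), so
    # h_n[-2] and h_n[-1] are the last layer's forward and backward final states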

      train.py

    '''
    Train the model
    '''
    import torch
    
    import config
    from model import ImdbModel
    from dataset import get_dataloader
    from torch.optim import Adam
    from tqdm import tqdm
    import torch.nn.functional as F
    import numpy as np
    import matplotlib.pyplot as plt
    from eval import eval
    
    model = ImdbModel().to(config.device)
    optimizer = Adam(model.parameters(),lr=0.001)
    loss_list = []
    
    def train(epoch):
        train_dataloader = get_dataloader(train=True)
        bar = tqdm(train_dataloader,total=len(train_dataloader))
    
        for idx,(input,target) in enumerate(bar):
            optimizer.zero_grad()
            input = input.to(config.device)
            target = target.to(config.device)
            output = model(input)
            loss = F.nll_loss(output,target)
            loss.backward()
            loss_list.append(loss.item())
            optimizer.step()
            bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch,idx,np.mean(loss_list)))
    
            if idx%10 == 0:
                # periodically checkpoint the model and optimizer state
                torch.save(model.state_dict(),"./models/model.pkl")
                torch.save(optimizer.state_dict(),"./models/optimizer.pkl")
    
    if __name__ == '__main__':
        for i in range(5):
            train(i)
            eval()
        plt.figure(figsize=(20,8))
        plt.plot(range(len(loss_list)),loss_list)
        plt.show()
    

      eval.py

    '''
    Evaluate the model on the test set
    '''
    import torch

    import config
    from model import ImdbModel
    from dataset import get_dataloader
    import torch.nn.functional as F
    import numpy as np
    
    
    
    def eval():
        model = ImdbModel().to(config.device)
        model.load_state_dict(torch.load("./models/model.pkl"))
        model.eval()
        loss_list = []
        acc_list = []
        test_dataloader = get_dataloader(train=False)
        with torch.no_grad():
            for input,target in test_dataloader:
                input = input.to(config.device)
                target = target.to(config.device)
                output = model(input)
                loss = F.nll_loss(output,target)
                loss_list.append(loss.item())
                # accuracy: predicted class = argmax of the log-probabilities
                pred = output.max(dim = -1)[-1]
                acc_list.append(pred.eq(target).cpu().float().mean().item())
            print("loss:{:.6f},acc:{}".format(np.mean(loss_list),np.mean(acc_list)))
    
    
    if __name__ == '__main__':
        eval()
    

      

  • Original post: https://www.cnblogs.com/LiuXinyu12378/p/12325009.html