Run main.py first to build and pickle the word-to-index vocabulary (text serialization), then run train.py to train the model.
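Both scripts also import a config module that is not part of this listing. Below is a minimal sketch of what config.py might contain, inferred from the attributes used in the files (config.ws, config.max_len, config.train_batch_size, config.test_batch_size, config.device); the numeric values are assumptions, not taken from the original code.

import os
import pickle
import torch

max_len = 200            # assumed sequence length after padding/truncation
train_batch_size = 128   # assumed batch sizes
test_batch_size = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The vocabulary is produced by main.py, so it does not exist on the very first run.
ws = None
if os.path.exists("./models/ws.pkl"):
    ws = pickle.load(open("./models/ws.pkl", "rb"))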
dataset.py
from torch.utils.data import DataLoader, Dataset
import torch
import os
from utils import tokenlize
import config


class ImdbDataset(Dataset):
    def __init__(self, train=True):
        super(ImdbDataset, self).__init__()
        data_path = r"H:\73-nlp自然语言处理-v5.bt38[周大伟]\73-nlp自然语言处理-v5.bt38[周大伟]\第四天代码\data\aclImdb_v1\aclImdb"
        data_path += r"\train" if train else r"\test"
        self.total_path = []
        for temp_path in [r"\pos", r"\neg"]:
            cur_path = data_path + temp_path
            self.total_path += [os.path.join(cur_path, i) for i in os.listdir(cur_path) if i.endswith(".txt")]

    def __getitem__(self, idx):
        file = self.total_path[idx]
        with open(file, encoding="utf-8") as f:
            review = tokenlize(f.read())
        # file names look like "123_8.txt"; the number after "_" is the rating (1-10)
        label = int(file.split("_")[-1].split(".")[0])
        label = 0 if label < 5 else 1  # negative if rating < 5, otherwise positive
        return review, label

    def __len__(self):
        return len(self.total_path)


def collate_fn(batch):
    '''
    Collate a batch: convert token lists into padded index tensors.
    :param batch: list of (review, label) pairs
    :return: (reviews, labels) as LongTensors
    '''
    reviews, labels = zip(*batch)
    reviews = torch.LongTensor([config.ws.transform(i, max_len=config.max_len) for i in reviews])
    labels = torch.LongTensor(labels)
    return reviews, labels


def get_dataloader(train):
    imdbdataset = ImdbDataset(train=train)  # was hard-coded to train=True
    batch_size = config.train_batch_size if train else config.test_batch_size
    return DataLoader(imdbdataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)


if __name__ == '__main__':
    # dataset = ImdbDataset(train=True)
    # print(dataset[1])
    for idx, (review, label) in enumerate(get_dataloader(train=True)):
        print(review)
        print(label)
        break
utils.py
""" 实现额外的方法 """ import re def tokenlize(sentence): """ 进行文本分词 :param sentence: str :return: [str,str,str] """ fileters = ['!', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\', ']', '^', '_', '`', '{', '|', '}', '~', ' ', ' ', 'x97', 'x96', '”', '“', ] sentence = sentence.lower() #把大写转化为小写 sentence = re.sub("<br />"," ",sentence) # sentence = re.sub("I'm","I am",sentence) # sentence = re.sub("isn't","is not",sentence) sentence = re.sub("|".join(fileters)," ",sentence) result = [i for i in sentence.split(" ") if len(i)>0] return result
word_sequence.py
'''
Text serialization: map tokens to integer indices.
'''


class WordSequence():
    UNK_TAG = "<UNK>"
    PAD_TAG = "<PAD>"
    UNK = 1
    PAD = 0

    def __init__(self):
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        self.count = {}

    def fit(self, sentence):
        '''
        Count word frequencies.
        :param sentence: list of tokens
        :return:
        '''
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min_count=0, max_count=None, max_features=None):
        """
        Build the vocabulary from the collected counts.
        :param min_count: minimum word frequency
        :param max_count: maximum word frequency
        :param max_features: maximum vocabulary size
        :return:
        """
        if min_count is not None:
            self.count = {word: count for word, count in self.count.items() if count > min_count}
        if max_count is not None:
            self.count = {word: count for word, count in self.count.items() if count < max_count}
        if max_features is not None:
            # keep the max_features most frequent words (key= is keyword-only in Python 3)
            self.count = dict(sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_features])
        for word in self.count:
            self.dict[word] = len(self.dict)  # each word gets the next free index
        # invert the mapping: index -> word
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=None):
        '''
        Convert a token list into an index sequence, padded/truncated to max_len.
        :param sentence:
        :return:
        '''
        if max_len is not None:
            if len(sentence) > max_len:
                sentence = sentence[:max_len]
            else:
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))
        return [self.dict.get(i, self.UNK) for i in sentence]

    def inverse_transform(self, incides):
        """
        Convert an index sequence back into tokens.
        :param incides:
        :return:
        """
        return [self.inverse_dict.get(i, self.UNK_TAG) for i in incides]

    def __len__(self):
        return len(self.dict)


if __name__ == '__main__':
    sentences = [["今天", "天气", "很", "好"],
                 ["今天", "去", "吃", "什么"]]
    ws = WordSequence()
    for sentence in sentences:
        ws.fit(sentence)
    ws.build_vocab(min_count=0)
    print(ws.dict)
    ret = ws.transform(["好", "热", "呀", "呀", "呀", "呀", "呀", "呀", "呀"], max_len=5)
    print(ret)
    ret = ws.inverse_transform(ret)
    print(ret)
main.py
from word_sequence import WordSequence
from dataset import ImdbDataset
import os
import pickle
from tqdm import tqdm

if __name__ == '__main__':
    ws = WordSequence()
    # Iterate over the raw datasets (tokenized reviews), not over the DataLoader:
    # the DataLoader's collate_fn already requires a finished vocabulary.
    train_data = ImdbDataset(train=True)
    test_data = ImdbDataset(train=False)
    for review, label in tqdm(train_data, total=len(train_data)):
        ws.fit(review)
    for review, label in tqdm(test_data, total=len(test_data)):
        ws.fit(review)
    print("Building vocabulary...")
    ws.build_vocab()
    print(len(ws))
    os.makedirs("./models", exist_ok=True)
    pickle.dump(ws, open("./models/ws.pkl", "wb"))
model.py
""" 构建模型 """ import torch.nn as nn import config import torch.nn.functional as F class ImdbModel(nn.Module): def __init__(self): super(ImdbModel,self).__init__() self.embedding = nn.Embedding(num_embeddings=len(config.ws),embedding_dim=300,padding_idx=config.ws.PAD) self.fc = nn.Linear(config.max_len*300,2) def forward(self,input): ''' :param input: :return: ''' input_embeded = self.embedding(input) input_embeded_viewed = input_embeded.view(input_embeded.size(0),-1) out = self.fc(input_embeded_viewed) return F.log_softmax(out,dim=-1)
LSTMmodel.py
""" 构建模型 """ import torch.nn as nn import torch import config import torch.nn.functional as F class ImdbModel(nn.Module): def __init__(self): super(ImdbModel,self).__init__() self.embedding = nn.Embedding(num_embeddings=len(config.ws),embedding_dim=300,padding_idx=config.ws.PAD) self.lstm = nn.LSTM(input_size=200,hidden_size=64,num_layers=2,batch_first=True,bidirectional=True,dropout=0.5) self.fc1 = nn.Linear(64*2,64) self.fc2 = nn.Linear(64,2) def forward(self,input): ''' :param input: :return: ''' input_embeded = self.embedding(input) #[batch_size,seq_len,200] output,(h_n,c_n) = self.lstm(input_embeded) out = torch.cat(h_n[-1,:,:],h_n[-2,:,:],dim=-1) #拼接正向最后一个输出和反向最后一个输出 #进行全连接 out_fc1 = self.fc1(out) #进行relu out_fc1_relu = F.relu(out_fc1) #全连接 out = self.fc2(out_fc1_relu) return F.log_softmax(out,dim=-1)
train.py
'''
Train the model.
'''
import torch
import config
from model import ImdbModel
from dataset import get_dataloader
from torch.optim import Adam
from tqdm import tqdm
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from eval import eval

model = ImdbModel().to(config.device)
optimizer = Adam(model.parameters(), lr=0.001)
loss_list = []


def train(epoch):
    train_dataloader = get_dataloader(train=True)
    bar = tqdm(train_dataloader, total=len(train_dataloader))
    for idx, (input, target) in enumerate(bar):
        optimizer.zero_grad()
        input = input.to(config.device)
        target = target.to(config.device)
        output = model(input)
        loss = F.nll_loss(output, target)
        loss.backward()
        loss_list.append(loss.item())
        optimizer.step()
        bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch, idx, np.mean(loss_list)))
        if idx % 10 == 0:
            torch.save(model.state_dict(), "./models/model.pkl")
            torch.save(optimizer.state_dict(), "./models/optimizer.pkl")


if __name__ == '__main__':
    for i in range(5):
        train(i)
        eval()
    plt.figure(figsize=(20, 8))
    plt.plot(range(len(loss_list)), loss_list)
    plt.show()
eval.py
'''
Evaluate the model on the test set.
'''
import torch
import config
from model import ImdbModel
from dataset import get_dataloader
import torch.nn.functional as F
import numpy as np


def eval():
    model = ImdbModel().to(config.device)
    model.load_state_dict(torch.load("./models/model.pkl", map_location=config.device))
    model.eval()
    loss_list = []
    acc_list = []
    test_dataloader = get_dataloader(train=False)
    with torch.no_grad():
        for input, target in test_dataloader:
            input = input.to(config.device)
            target = target.to(config.device)
            output = model(input)
            loss = F.nll_loss(output, target)
            loss_list.append(loss.item())
            # accuracy
            pred = output.max(dim=-1)[-1]
            acc_list.append(pred.eq(target).cpu().float().mean())
    print("loss:{:.6f},acc:{}".format(np.mean(loss_list), np.mean(acc_list)))


if __name__ == '__main__':
    eval()