The project consists of the following files:
Note that the last one is a JSON file: the vocabulary generated from the training split of the MR movie-review dataset. It is just a dictionary file, so you can also build one yourself.
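For reference, the dictionary simply maps each training-split word to an integer id, reserving 0 for padding and 1 for unknown words (see create_dict in step 2). A quick way to inspect it, assuming the file has already been generated:

import json

# inspect the vocabulary file produced by create_dict; ids 0 and 1 are reserved
with open('vd.json', 'r', encoding='utf-8') as f:
    vd = json.load(f)
print(vd['PADDING'], vd['UNKNOWN'])  # 0 1
print(len(vd))                       # vocabulary size, e.g. 16783 for this split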
After training on the training set for 10 epochs, the results look roughly like the figure above.
1. Create the model_para.py file, which holds the model's hyperparameters.
import argparse

class Hpara():
    parser = argparse.ArgumentParser()
    ############# insert paras #############
    parser.add_argument('--batch_size', default=16, type=int)              # batch size
    parser.add_argument('--epochs', default=1, type=int)                   # number of epochs
    parser.add_argument('--lr', default=3e-5, type=float)                  # learning rate
    parser.add_argument('--l2_lambda', default=0.0, type=float)            # L2 loss lambda
    parser.add_argument('--seg_net_nums_layer', default=6, type=int)       # SegNet RNN layers
    parser.add_argument('--seg_net_dropout_rate', default=0.2, type=float) # SegNet dropout rate
    parser.add_argument('--seg_net_hidden_size', default=64, type=int)     # SegNet hidden size
    parser.add_argument('--seg_net_n_best', default=12, type=int)          # sample nums
    parser.add_argument('--seg_net_beam_size', default=8, type=int)        # beam size
    parser.add_argument('--seg_net_rnntype', default='GRU', type=str)      # SegNet RNN type
    parser.add_argument('--is_bi_rnn', default=True, type=bool)            # SegNet bidirectional RNN
    parser.add_argument('--unfreezing_bert_nums', default=12, type=int)
    parser.add_argument('--is_training_bert', default=True, type=bool)
    parser.add_argument('--is_load_best_model', default=False, type=bool)
    ## MLP model paras
    parser.add_argument('--MLP_hidden_nums_layer', default=1, type=int)
    parser.add_argument('--Bert_drop_out', default=0.1, type=float)
    ## SA model paras
    parser.add_argument('--SA_hidden_dim', default=256, type=int)
    parser.add_argument('--SA_is_bi', default=True, type=bool)
    parser.add_argument('--SA_dropout_rate', default=0.4, type=float)
    parser.add_argument('--SA_word_dim', default=300, type=int)
    parser.add_argument('--SA_vocab_size', default=16783, type=int)
    parser.add_argument('--SA_layer_nums', default=2, type=int)
    parser.add_argument('--SA_Rnn_type', default='LSTM', type=str)
    parser.add_argument('--pretrained_bert_path', default='/bert-base-uncased', type=str)
    parser.add_argument('--bert_out_size', default=768, type=int)
    parser.add_argument('--bert_vocab_path', default='/bert-base-uncased/vocab.txt', type=str)
    parser.add_argument('--saved_model_path', default='./best_model', type=str)
    parser.add_argument('--final_model_path', default='./final_epoch_saved_model', type=str)
    ## SegNet model paras
    parser.add_argument('--pretrained_segnet_model', default='./segnet_model/trained_model.torchsave', type=str)
    parser.add_argument('--seg_net_vocab', default='./segnet_model/all_vocabulary.pickle', type=str)
    parser.add_argument('--seg_net_vocab_size', default=16703, type=int)
    parser.add_argument('--seg_net_embed_size', default=300, type=int)
    parser.add_argument('--seg_net_use_cuda', default=False, type=bool)
    parser.add_argument('--seg_net_finetuning', default=False, type=bool)
    parser.add_argument('--seg_net_isbanor', default=True, type=bool)
    ## Other paras
    parser.add_argument('--emotion_label_nums', default=2, type=int)
    parser.add_argument('--train_csv', default=r'./data/train_1.csv', type=str)
    parser.add_argument('--test_csv', default=r'./data/test_1.csv', type=str)
    parser.add_argument('--val_csv', default=r'./data/val_1.csv', type=str)
The parameters here were copied straight from another project of mine; many of them were never changed, and only a few are actually used. You can check which ones in the code below.
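One caveat worth knowing: argparse's type=bool does not behave the way it looks on the command line, because bool('False') is True in Python, so any non-empty string sets the flag to True. Only the defaults above behave as written. If you want to override these flags from the CLI, a small string-to-bool converter (my own addition, not part of the original project) works:

def str2bool(v):
    # argparse passes the raw command-line string; map common spellings to booleans
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

# usage: parser.add_argument('--is_bi_rnn', default=True, type=str2bool)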
2. Create the data-processing file data_utils.py
This file loads the dataset and feeds it to the model in batches. Most importantly, it sorts the texts in each batch by length, so that the LSTM (or any other RNN) can skip the padding during computation. A usage sketch follows the listing.
import numpy as np
import json
import pandas as pd

def del_none(text):  # remove zero-length strings left over after splitting sentences on spaces
    new_text = []
    for t in text:
        ws = t.split(' ')
        ws = [w for w in ws if w != '']
        new_text.append(' '.join(ws))
    return new_text

def read_csv(para):  # read the Movie Review dataset
    train_data = pd.read_csv(para.train_csv, delimiter=',')
    test_data = pd.read_csv(para.test_csv, delimiter=',')
    val_data = pd.read_csv(para.val_csv, delimiter=',')
    train_text = list(train_data['INPUT'])
    train_label = list(train_data['OUTPUT'])
    test_text = list(test_data['INPUT'])
    test_label = list(test_data['OUTPUT'])
    val_text = list(val_data['INPUT'])
    val_label = list(val_data['OUTPUT'])
    train_text = del_none(train_text)
    test_text = del_none(test_text)
    val_text = del_none(val_text)
    train_label = np.array(train_label, dtype=np.int32)
    test_label = np.array(test_label, dtype=np.int32)
    val_label = np.array(val_label, dtype=np.int32)
    return train_text, test_text, val_text, train_label, test_label, val_label

def create_dict(train_text):  # build the vocabulary from the training split and dump it to vd.json
    aw = []
    for t in train_text:
        ws = t.split(' ')
        aw.extend(ws)
    vocab = list(set(aw))
    vd = dict(zip(vocab, range(2, len(vocab) + 2)))
    vd['UNKNOWN'] = 1
    vd['PADDING'] = 0
    with open('vd.json', 'w', encoding='utf-8') as f:
        json.dump(vd, f)

def get_vd(path):
    with open(path, 'r', encoding='utf-8') as f:
        vd = json.load(f)
    return vd

def text2id(text, vd):  # map words to ids, padding every sentence to the batch max length
    bs = len(text)
    lens = [len(t.split(' ')) for t in text]
    ml = max(lens)
    text_id = np.zeros(shape=(bs, ml), dtype=np.int32)
    for i in range(bs):
        ws = text[i].split(' ')
        for j in range(lens[i]):
            if ws[j] in vd:  # dict lookup instead of scanning a list of keys
                text_id[i, j] = vd[ws[j]]
            else:
                text_id[i, j] = vd['UNKNOWN']
    return text_id, lens

def sort_data_by_length(textid, label, lengths):  # sort a batch longest-first, as pack_padded_sequence expects
    lengths = np.array(lengths, dtype=np.int32)
    idx = np.argsort(lengths)
    idx = idx[::-1]
    lengths = lengths[idx]
    sc_id = textid[idx]
    label = label[idx]
    return sc_id, label, lengths

def batch_iter_str(text, label, batch_size=16):  # shuffle the data and yield length-sorted batches
    path = 'vd.json'
    vd = get_vd(path)
    data_nums = len(text)
    num_batch = (data_nums + batch_size - 1) // batch_size
    indices = np.random.permutation(np.arange(data_nums))
    text = [text[i] for i in indices]
    label = label[indices]
    for i in range(num_batch):
        start_offset = i * batch_size
        end_offset = min(start_offset + batch_size, data_nums)
        bt_text = text[start_offset:end_offset]
        bt_label = label[start_offset:end_offset]
        text_id, lens = text2id(bt_text, vd)
        sc_id, bt_label, lengths = sort_data_by_length(text_id, bt_label, lens)
        yield i, num_batch, sc_id, bt_label, lengths
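If you want to sanity-check this pipeline on its own, here is a minimal sketch (my addition; it assumes the CSV files from step 1 exist, and that create_dict has been run once to write vd.json):

from model_para import Hpara
from data_utils import read_csv, create_dict, batch_iter_str

para = Hpara().parser.parse_args([])   # use the defaults
train_text, test_text, val_text, train_label, test_label, val_label = read_csv(para)
create_dict(train_text)                # writes vd.json next to the script

for i, num_batch, sc_id, bt_label, lengths in batch_iter_str(train_text, train_label):
    # each batch is padded to its own max length and sorted longest-first
    print(sc_id.shape, lengths[:5])
    break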
3. Create the Model.py file
import torch.nn as nn
import torch

device = torch.device("cuda:4" if torch.cuda.is_available() else "cpu")

class sa_model(nn.Module):
    def __init__(self, para):
        super(sa_model, self).__init__()
        self.para = para
        if para.SA_Rnn_type == 'LSTM':
            self.RNN = nn.LSTM(input_size=para.SA_word_dim,
                               hidden_size=para.SA_hidden_dim,
                               num_layers=para.SA_layer_nums,
                               batch_first=True,
                               dropout=para.SA_dropout_rate,
                               bidirectional=para.SA_is_bi)
        elif para.SA_Rnn_type == 'GRU':
            self.RNN = nn.GRU(input_size=para.SA_word_dim,
                              hidden_size=para.SA_hidden_dim,
                              num_layers=para.SA_layer_nums,
                              batch_first=True,
                              dropout=para.SA_dropout_rate,
                              bidirectional=para.SA_is_bi)
        elif para.SA_Rnn_type == 'RNN':
            self.RNN = nn.RNN(input_size=para.SA_word_dim,
                              hidden_size=para.SA_hidden_dim,
                              num_layers=para.SA_layer_nums,
                              batch_first=True,
                              dropout=para.SA_dropout_rate,
                              bidirectional=para.SA_is_bi)
        self.nnEm = nn.Embedding(para.SA_vocab_size, para.SA_word_dim, padding_idx=0).to(device)
        self.criterion = nn.CrossEntropyLoss()
        self.linear = nn.Linear(para.SA_hidden_dim * 2, para.emotion_label_nums)

    def forward(self, x, x_len, label):
        x = self.nnEm(x)
        # pack_padded_sequence expects the lengths on the CPU in recent PyTorch versions
        packed_embeds = nn.utils.rnn.pack_padded_sequence(x, x_len.cpu(), batch_first=True)
        out, hs = self.RNN(packed_embeds)
        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        # for an LSTM, hs is the tuple (h_n, c_n); for a GRU/RNN it is h_n itself
        h_n = hs[0] if isinstance(hs, tuple) else hs
        # h_n has shape (num_layers * num_directions, batch, hidden); the last two rows
        # are the top layer's forward and backward final hidden states
        h = torch.cat([h_n[-2], h_n[-1]], dim=1)
        logits = self.linear(h)
        loss = self.criterion(logits, label)
        pred = torch.argmax(torch.nn.functional.softmax(logits, dim=1), dim=1)
        return loss, pred
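Before wiring the model into training, a quick smoke test on random ids confirms the shapes line up. This sketch is my own addition; note the lengths must be sorted in descending order because pack_padded_sequence is called with its default enforce_sorted=True:

import torch
from model_para import Hpara
from Model import sa_model, device

para = Hpara().parser.parse_args([])
model = sa_model(para).to(device)

ids = torch.randint(2, para.SA_vocab_size, (4, 10)).long().to(device)  # fake batch of 4 sentences
lens = torch.tensor([10, 8, 6, 3]).long()                              # sorted, longest first
labels = torch.tensor([0, 1, 1, 0]).long().to(device)
loss, pred = model(ids, lens, labels)
print(loss.item(), pred)  # scalar loss and one prediction per sentence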
4. Create the train.py file
from Model import sa_model
from data_utils import batch_iter_str, read_csv
import torch
from model_para import Hpara
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
from sklearn.metrics import accuracy_score
device = torch.device("cuda:4" if torch.cuda.is_available() else "cpu")
print(device)
def train(text, label, model, epoch):
    # rebuild the optimizer each epoch so the learning rate decays by a factor of 0.95 per epoch
    optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, model.parameters()), lr=para.lr * (0.95) ** epoch)
    pred_label = []
    true_label = []
    all_loss = []
    model.train()
    db = batch_iter_str(text, label)
    for i, num_batch, sc_id, label, lengths in db:
        sc_id = torch.tensor(sc_id).long().to(device)
        label = torch.tensor(label).long().to(device)
        lens = torch.tensor(lengths).long().to(device)
        model.zero_grad()
        loss, pred = model(sc_id, lens, label)
        all_loss.append(loss.item())  # store the float, not the tensor, so the graph is freed
        pred_label.append(pred)
        true_label.append(label)
        loss.backward()
        optimizer.step()
        if (i + 1) % 10 == 0:
            print('%d step loss is : %f' % (i + 1, loss.item()))
            print('%d acc is : %f' % (i + 1, accuracy_score(label.data.cpu().numpy(), pred.data.cpu().numpy())))
    mean_loss = sum(all_loss) / num_batch
    pred_label = torch.cat(pred_label, dim=0)
    true_label = torch.cat(true_label, dim=0)
    mean_acc = accuracy_score(true_label.cpu().numpy(), pred_label.cpu().numpy())
    return mean_loss, mean_acc
def test(saved_acc, text, label, model, mode='validation'):
    pred_label = []
    true_label = []
    all_loss = []
    model.eval()
    db = batch_iter_str(text, label)
    with torch.no_grad():  # no gradients are needed during evaluation
        for i, num_batch, sc_id, label, lengths in db:
            sc_id = torch.tensor(sc_id).long().to(device)
            label = torch.tensor(label).long().to(device)
            lens = torch.tensor(lengths).long().to(device)
            loss, pred = model(sc_id, lens, label)
            all_loss.append(loss.item())
            pred_label.append(pred)
            true_label.append(label)
    mean_loss = sum(all_loss) / num_batch
    pred_label = torch.cat(pred_label, dim=0)
    true_label = torch.cat(true_label, dim=0)
    mean_acc = accuracy_score(true_label.cpu().numpy(), pred_label.cpu().numpy())
    if mode == 'validation':  # save the best model when the mode is 'validation'
        if mean_acc >= saved_acc:
            torch.save(model, os.path.join(para.saved_model_path, 'My_model_pth'))
            saved_acc = mean_acc
        ## TODO: the next step is to save the model every epoch
        torch.save(model, os.path.join(para.final_model_path, 'My_model_pth'))
    return mean_loss, mean_acc, saved_acc
def load_model(para, model, best_model=True):
    if best_model:
        model_path = para.saved_model_path
    else:
        model_path = para.final_model_path
    if os.path.exists(os.path.join(model_path, 'My_model_pth')):
        print('model path is : ', model_path)
        model = torch.load(os.path.join(model_path, 'My_model_pth'))
        print('load pre-train')
    return model
if __name__ == '__main__':
    saved_acc = 0.0
    hp = Hpara()
    parser = hp.parser
    para = parser.parse_args()
    train_text, test_text, val_text, train_label, test_label, val_label = read_csv(para)
    model = sa_model(para)
    model = model.to(device)
    if para.is_load_best_model:
        print('load the best model...')
        model = load_model(para, model)
    else:
        print('load the final epoch model...')
        model = load_model(para, model, best_model=False)
    # evaluate the loaded model once so saved_acc starts from its validation accuracy
    _, _, saved_acc = test(saved_acc, val_text, val_label, model, mode='get_saved_acc')
    print('saved accuracy is :', saved_acc)
    train_loss = []
    train_acc = []
    val_loss = []
    val_acc = []
    test_loss = []
    test_acc = []
    for i in range(para.epochs):
        ########### train ###########
        train_mean_loss_i, train_mean_acc_i = train(train_text, train_label, model, i)
        train_loss.append(train_mean_loss_i)
        train_acc.append(train_mean_acc_i)
        ########## val #############
        val_loss_i, val_acc_i, saved_acc = test(saved_acc, val_text, val_label, model)
        print('%d epoch val acc is : %f' % (i + 1, val_acc_i))
        val_loss.append(val_loss_i)
        val_acc.append(val_acc_i)
        ########## test ###########
        test_loss_i, test_acc_i, saved_acc = test(saved_acc, test_text, test_label, model, mode='test')
        print('%d epoch test acc is : %f' % (i + 1, test_acc_i))
        test_loss.append(test_loss_i)
        test_acc.append(test_acc_i)
    plt.figure(2)
    plt.subplot(231)
    plt.plot(train_loss)
    plt.xlabel('epochs')
    plt.ylabel('emotion_train_loss')
    plt.subplot(232)
    plt.plot(test_loss)
    plt.xlabel('epochs')
    plt.ylabel('emotion_test_loss')
    plt.subplot(233)
    plt.plot(val_loss)
    plt.xlabel('epochs')
    plt.ylabel('emotion_val_loss')
    plt.subplot(234)
    plt.plot(train_acc)
    plt.xlabel('epochs')
    plt.ylabel('emotion_train_acc')
    plt.subplot(235)
    plt.plot(test_acc)
    plt.xlabel('epochs')
    plt.ylabel('emotion_test_acc')
    plt.subplot(236)
    plt.plot(val_acc)
    plt.xlabel('epochs')
    plt.ylabel('emotion_val_acc')
    plt.show()
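One practical note before running train.py: torch.save does not create missing directories, so ./best_model and ./final_epoch_saved_model must exist before the first save. A small guard (my addition) you can drop in right after parse_args():

import os

# make sure the two save directories exist before torch.save is ever called
os.makedirs(para.saved_model_path, exist_ok=True)
os.makedirs(para.final_model_path, exist_ok=True)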
5. Full code
Link: https://pan.baidu.com/s/1nWMThnz9eK8zVggQJ7Umag
Extraction code: 0rwq