BERT大火之后,涌现出了很多BERT的变种。这里借用Huggingface工具来简单实现一个文本分类任务,从而进一步通过Huggingface来认识BERT在工程上的实现方法。
1、load data
# Load the training set into a two-column frame named ("text", "label").
# A .tsv file is tab-separated, so the delimiter must be '\t'; splitting on a
# single space would cut every sentence into separate columns and misalign
# the label column.
# NOTE(review): assumes the file has no header row (names= is supplied) --
# confirm against the actual data file.
train_df = pd.read_csv('../data/train.tsv', delimiter='\t', names=['text', 'label'])
print(train_df.shape)
train_df.head()  # notebook-style preview; has no visible effect in a script

# Raw sentences as a Python list, labels as a NumPy array.
sentences = list(train_df['text'])
targets = train_df['label'].values
2、token encoding
# If tokenization is moved inside a custom model class, max_length has to be
# specified explicitly.
max_length = 32
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every sentence in a single call and get PyTorch tensors back
# (return_tensors='pt'). Sequences are padded to a common length and
# truncated so that none exceeds max_length tokens.
sentences_tokened = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors='pt',
)

# Convert the label array to a tensor so it can be indexed alongside the
# encoded inputs.
targets = torch.tensor(targets)
3、encoding data
from torch.utils.data import Dataset, DataLoader, random_split


class DataToDataset(Dataset):
    """Pair tokenizer output with labels so a DataLoader can index them.

    Each item is the tuple ``(input_ids, attention_mask, label)`` taken at
    the same index from the encoding and from the labels.
    """

    def __init__(self, encoding, labels):
        self.encoding = encoding  # mapping with 'input_ids' / 'attention_mask'
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, index):
        ids = self.encoding['input_ids'][index]
        mask = self.encoding['attention_mask'][index]
        return ids, mask, self.labels[index]


# Wrap the encoded sentences and labels, then split 80% / 20% at random.
datasets = DataToDataset(sentences_tokened, targets)
train_size = int(len(datasets) * 0.8)
test_size = len(datasets) - train_size
print([train_size, test_size])
train_dataset, val_dataset = random_split(datasets, [train_size, test_size])

BATCH_SIZE = 64
# Original author's note: num_workers here should be greater than 0.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=5,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=5,
)
4、create model
class BertTextClassficationModel(nn.Module):
    """Pretrained BERT encoder followed by a linear layer with 2 outputs.

    ``forward`` feeds the token ids and attention mask through BERT, takes the
    hidden state at sequence position 0 and maps it to 2 logits.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dense = nn.Linear(768, 2)  # 768 input, 2 output

    def forward(self, ids, mask):
        """Return the classifier logits for a batch.

        Args:
            ids: token id tensor passed to BERT as ``input_ids``.
            mask: attention mask tensor passed to BERT as ``attention_mask``.

        Returns:
            Output of the linear layer applied to the position-0 hidden state.
        """
        outputs = self.bert(input_ids=ids, attention_mask=mask)
        # Index instead of tuple-unpacking: recent transformers versions
        # return a dict-like ModelOutput, and unpacking that yields its keys
        # (strings) rather than tensors. Integer indexing works for both the
        # legacy tuple and the ModelOutput.
        last_hidden_state = outputs[0]
        return self.dense(last_hidden_state[:, 0, :])


mymodel = BertTextClassficationModel()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device=", device)
if torch.cuda.device_count() > 1:
    # Replicate the model across all visible GPUs.
    print("Let's use ", torch.cuda.device_count(), "GPUs!")
    mymodel = nn.DataParallel(mymodel)
mymodel.to(device)
5、train model
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(mymodel.parameters(), lr=0.0001)

from sklearn.metrics import accuracy_score


def flat_accuracy(preds, labels):
    """Return the accuracy of row-wise argmax predictions against labels.

    Args:
        preds: 2-D NumPy array of scores; the predicted class of each row is
            the index of its largest value.
        labels: NumPy array of true class ids.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)


epochs = 3
for epoch in range(epochs):
    # from_pretrained() leaves the BERT weights in eval mode (dropout off);
    # switch the whole model to training mode before fine-tuning.
    mymodel.train()
    train_loss = 0.0
    train_acc = 0.0
    for i, data in enumerate(train_loader):
        input_ids, attention_mask, labels = [elem.to(device) for elem in data]
        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()
        # Forward pass.
        out = mymodel(input_ids, attention_mask)
        # Compute the loss.
        loss = loss_func(out, labels)
        train_loss += loss.item()
        # Back-propagate and update the parameters.
        loss.backward()
        optimizer.step()
        # Accuracy is computed with NumPy/sklearn, so the tensors must be
        # moved to the CPU first; .numpy() raises on CUDA tensors.
        out = out.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()
        train_acc += flat_accuracy(out, labels)
    # Report the per-batch averages for this epoch.
    print("train %d/%d epochs Loss:%f, Acc:%f" % (epoch, epochs, train_loss / (i + 1), train_acc / (i + 1)))
6、evaluate
print("evaluate...")
val_loss = 0.0
val_acc = 0.0
# Switch to evaluation mode before scoring the validation set.
mymodel.eval()
for batch in val_loader:
    val_input_ids, val_attention_mask, val_labels = [elem.to(device) for elem in batch]
    # No gradients are needed for evaluation.
    with torch.no_grad():
        pred = mymodel(val_input_ids, val_attention_mask)
        # .item() accumulates a plain Python float instead of a tensor.
        val_loss += loss_func(pred, val_labels).item()
    pred = pred.detach().cpu().numpy()
    val_labels = val_labels.detach().cpu().numpy()
    val_acc += flat_accuracy(pred, val_labels)
# %f rather than %d: both values are fractional averages, and %d would
# truncate them to integers (an accuracy below 1.0 would print as 0).
print("evaluate loss:%f, Acc:%f" % (val_loss / len(val_loader), val_acc / len(val_loader)))