• Fine-tuning BERT for text classification with Huggingface


    After BERT took off, a large number of BERT variants appeared. This post uses the Huggingface library to implement a simple text classifier with BERT, and in doing so takes a closer look at how BERT is put to work in engineering practice through Huggingface.

    1、load data

    import pandas as pd

    train_df = pd.read_csv('../data/train.tsv',delimiter='\t',names=['text','label'])
    print(train_df.shape)
    train_df.head()

    sentences = list(train_df['text'])
    targets =train_df['label'].values
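
    The read_csv call above assumes train.tsv holds two tab-separated columns (text, label) with no header row. As an optional sketch, a quick check of the loaded data can confirm this:

    print(train_df['label'].value_counts())        # distribution of the two classes
    print(train_df['text'].str.len().describe())   # rough sentence-length statistics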

    2、token encoding

    import torch
    from transformers import BertTokenizer

    # if the tokenized output is wrapped into a custom Dataset/model, max_length needs to be specified
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
    max_length=32
    sentences_tokened=tokenizer(sentences,padding=True,truncation=True,max_length=max_length,return_tensors='pt')
    targets=torch.tensor(targets)
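
    The tokenizer returns a dict-like BatchEncoding; a small inspection sketch (the exact shapes depend on your data, since padding=True pads to the longest sentence, capped at max_length):

    print(sentences_tokened.keys())                    # input_ids, token_type_ids, attention_mask
    print(sentences_tokened['input_ids'].shape)        # (num_sentences, padded_len)
    print(sentences_tokened['attention_mask'][0])      # 1 for real tokens, 0 for padding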

    3、wrap data into Dataset and DataLoader

    # from torchvision import transforms,datasets
    from torch.utils.data import Dataset,DataLoader,random_split
    
    class DataToDataset(Dataset):
        def __init__(self,encoding,labels):
            self.encoding=encoding
            self.labels=labels
            
        def __len__(self):
            return len(self.labels)
            
        def __getitem__(self,index):
            return self.encoding['input_ids'][index],self.encoding['attention_mask'][index],self.labels[index]
    
    # wrap the encoded data and labels into a Dataset
    datasets=DataToDataset(sentences_tokened,targets)
    train_size=int(len(datasets)*0.8)
    test_size=len(datasets)-train_size
    print([train_size,test_size])
    train_dataset,val_dataset=random_split(dataset=datasets,lengths=[train_size,test_size])
    
    BATCH_SIZE=64
    # num_workers > 0 enables multi-process data loading
    train_loader=DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=5)
    
    val_loader=DataLoader(dataset=val_dataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=5)
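
    As an optional sanity check, one batch can be pulled from the loader and unpacked the same way the training loop below does:

    batch_ids,batch_mask,batch_labels=next(iter(train_loader))
    print(batch_ids.shape,batch_mask.shape,batch_labels.shape)  # e.g. (64, padded_len), (64, padded_len), (64,)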

    4、create model

    import torch.nn as nn
    from transformers import BertModel

    class BertTextClassficationModel(nn.Module):
        def __init__(self):
            super(BertTextClassficationModel,self).__init__()
            self.bert=BertModel.from_pretrained('bert-base-uncased')
            self.dense=nn.Linear(768,2)  # 768 hidden size in, 2 classes out
            
        def forward(self,ids,mask):
            # outputs[0] is the last hidden state, shape (batch, seq_len, 768)
            outputs=self.bert(input_ids=ids,attention_mask=mask)
            cls_hidden=outputs[0][:,0,:]  # representation of the [CLS] token
            return self.dense(cls_hidden)
    
    
    mymodel=BertTextClassficationModel()
    
    
    # pick the GPU if one is available, otherwise fall back to the CPU
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device=",device)
    if torch.cuda.device_count()>1:
        print("Let's use ",torch.cuda.device_count(),"GPUs!")
        mymodel=nn.DataParallel(mymodel)
    mymodel.to(device)
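
    Note that transformers also ships a ready-made sequence-classification head, so the custom class above could be replaced by the stock model; a minimal alternative sketch (not used in the rest of this post):

    from transformers import BertForSequenceClassification
    # the forward pass returns an output object whose .logits has shape (batch, 2);
    # passing labels=... additionally makes it compute the cross-entropy loss internally
    alt_model=BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)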

    5、train model

    import numpy as np
    import torch.optim as optim

    loss_func=nn.CrossEntropyLoss()
    optimizer=optim.Adam(mymodel.parameters(),lr=0.0001)
    
    from sklearn.metrics import accuracy_score
    def flat_accuracy(preds,labels):
        pred_flat=np.argmax(preds,axis=1).flatten()
        labels_flat=labels.flatten()
        return accuracy_score(labels_flat,pred_flat)
    
    epochs=3
    for epoch in range(epochs):
        mymodel.train()
        train_loss = 0.0
        train_acc=0.0
        for i,data in enumerate(train_loader):
            input_ids,attention_mask,labels=[elem.to(device) for elem in data]
            # zero the gradients from the previous step
            optimizer.zero_grad()
            # forward pass through the model
            out=mymodel(input_ids,attention_mask)
            # compute the loss
            loss=loss_func(out,labels)
            train_loss += loss.item()
            # backpropagate the loss
            loss.backward()
            # update the model parameters
            optimizer.step()
            # compute accuracy (move tensors back to the CPU before converting to numpy)
            out=out.detach().cpu().numpy()
            labels=labels.cpu().numpy()
            train_acc+=flat_accuracy(out,labels)
    
        print("train %d/%d epochs Loss:%f, Acc:%f" %(epoch,epochs,train_loss/(i+1),train_acc/(i+1)))

    6、evaluate

    print("evaluate...")
    val_loss=0
    val_acc=0
    mymodel.eval()
    for j,batch in enumerate(val_loader):
        val_input_ids,val_attention_mask,val_labels=[elem.to(device) for elem in batch]
        with torch.no_grad():
            pred=mymodel(val_input_ids,val_attention_mask)
            val_loss+=loss_func(pred,val_labels).item()
            pred=pred.detach().cpu().numpy()
            val_labels=val_labels.detach().cpu().numpy()
            val_acc+=flat_accuracy(pred,val_labels)
    print("evaluate loss:%d, Acc:%d" %(val_loss/len(val_loader),val_acc/len(val_loader)))
        
  • Original article: https://www.cnblogs.com/ljy2013/p/13726148.html