paddlepaddle(3)



    一、Background

    This project addresses sentiment recognition of netizens' Weibo posts during the COVID-19 epidemic. PaddleHub, produced by PaddlePaddle, is a toolkit for pre-trained model management and transfer learning: it makes it easy to obtain pre-trained models from the PaddlePaddle ecosystem, manage them, and run one-line prediction. Combined with its Fine-tune API, transfer learning on top of a large-scale pre-trained model can be completed quickly, so the model better serves a user's specific scenario. This project therefore uses Baidu's PaddleHub fine-tuning toolkit to build a competition solution quickly.
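
    For reference, the "one-line prediction" mentioned above looks roughly like the sketch below. This is a minimal illustration assuming PaddleHub 1.x and its off-the-shelf senta_bilstm Chinese sentiment module (installed beforehand with "hub install senta_bilstm"); it is not part of the competition pipeline itself.

    # Minimal PaddleHub "one-line prediction" sketch (PaddleHub 1.x API assumed).
    import paddlehub as hub

    senta = hub.Module(name="senta_bilstm")
    results = senta.sentiment_classify(texts=["天气真好", "这也太糟糕了"])
    for r in results:
        print(r)  # each result is a dict with the text and its sentiment probabilities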

    二、Code

    # # Unzip the datasets
    # !cd data/data22724 && unzip test_dataset.zip
    # !cd data/data22724 && unzip "train_ dataset.zip"
    # !hub install ernie
    
    import pandas as pd
    import numpy as np
    
    import jieba
    import re
    
    import paddlehub as hub
    from sklearn.model_selection import StratifiedKFold
    from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
    
    from matplotlib import pyplot as plt
    %matplotlib inline
    
    # Regex patterns for Weibo noise: whitespace, location/@-mention stubs, forward
    # and reply markers, link placeholders, hashtags, Latin characters and digits,
    # and decorative punctuation.
    unuseful = ['\t', '\n', '2[\u4e00-\u9fa5]{2,7}·.*?？', '【.*?】', '//@.*?:', '//@.*?:', '回复@.*?:',
                'O网页链接', '？展开全文c', '我免费围观了.*?~O微博问答？', '#.*?#', '？', '[A-Za-z0-9]',
                '[\u0800-\u4e00]', '-', '、', '~', '『', '』', '—', '（.*?）', '年', '月', '日', r'\(.*?\)', '◎', '“', '”']
    
    # Load the labeled training set (GB18030-encoded), keep only rows whose label
    # is -1/0/1, and strip the noise patterns defined above.
    with open(file='/home/aistudio/data/data22724/nCoV_100k_train.labled.csv', mode='r', encoding='gb18030', errors='ignore') as fp:
        train_labled = pd.read_csv(fp)
        train_labled = train_labled[train_labled['情感倾向'].isin(['-1', '0', '1'])]
        for content in unuseful:
            train_labled['微博中文内容'] = train_labled['微博中文内容'].str.replace(content, '')
        # Collapse repeated punctuation and drop brackets. Each step must be
        # .str.replace: bare Series.replace only matches whole cell values.
        for pat, rep in [('!{2,}', '!'), ('《', ''), ('》', ''), ('。{2,}', '。'), (r'\.{2,}', '.'),
                         ('【', ''), ('】', ''), ('？{2,}', '？'), (' ', '')]:
            train_labled['微博中文内容'] = train_labled['微博中文内容'].str.replace(pat, rep)
    
    # Apply the same cleaning to the unlabeled test set.
    with open(file='/home/aistudio/data/data22724/nCov_10k_test.csv', mode='r', encoding='gb18030', errors='ignore') as fp:
        test = pd.read_csv(fp)
        for content in unuseful:
            test['微博中文内容'] = test['微博中文内容'].str.replace(content, '')
        for pat, rep in [('!{2,}', '!'), ('《', ''), ('》', ''), ('。{2,}', '。'), (r'\.{2,}', '.'),
                         ('【', ''), ('】', ''), ('？{2,}', '？'), (' ', '')]:
            test['微博中文内容'] = test['微博中文内容'].str.replace(pat, rep)
    # train_labled[['微博中文内容', '情感倾向']].to_csv('train.txt')
    
    # Strip meaningless characters; rows reduced to nothing get a "***" placeholder.
    def del_reply_mark(sentence):
        output = sentence
        for cont in unuseful:
            output = re.sub(cont, '', output)
        if output == "":
            output = "***"
        return output
    
    # Map full-width punctuation and digits to half-width equivalents, then fold
    # runs of dots into an ellipsis and normalize a few emoticons.
    def rep_chn_punc(sentence):
        table = {ord(f): ord(t) for f, t in zip(
            u'，。！？【】（）％＃＠＆１２３４５６７８９０①②③④⑤、·：［］（）：；',
            u',.!?....%#@&123456789012345,........')}
        output = (sentence.translate(table)
                  .replace("...", "…").replace("..", "…")
                  .replace("《", "").replace("》", "").replace("℃", "度")
                  .replace("——", "").replace("「", "").replace("」", "")
                  .replace("T T", "TT").replace("T_T", "TT"))
        return output
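
    A quick sanity check of this normalizer (a tiny illustrative example; the expected output is shown in the comment):

    # '，' and '！' are mapped to half-width, and '℃' becomes '度'.
    print(rep_chn_punc('今天38℃，热！'))  # -> 今天38度,热!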
    
    # Tokenize with jieba and drop stopwords. The original post never defines
    # `stopwords`; an empty set is used here as a placeholder (load a real
    # stopword list, one word per line, to make the filter meaningful).
    stopwords = set()

    def chn_tokenize(sentence):
        line_list = jieba.lcut(sentence, HMM=True)
        out_str = ''
        for word in line_list:
            if word not in stopwords and word != '\t':
                out_str += word
        return out_str
    
    
    # Stratified 5-fold split on the sentiment label; each fold is written out as
    # header-less, tab-separated text in the format PaddleHub's reader expects.
    folds = 5
    sfolder = StratifiedKFold(n_splits=folds, random_state=1, shuffle=True)
    train_labled = train_labled[['微博中文内容', '情感倾向']]
    fold = 0
    for train_index, valid_index in sfolder.split(train_labled['微博中文内容'], train_labled['情感倾向']):
        train = train_labled.iloc[train_index.tolist()]
        valid = train_labled.iloc[valid_index.tolist()]
        train.to_csv('train_' + str(fold) + '.txt', index=False, header=False, sep='\t')
        valid.to_csv('valid_' + str(fold) + '.txt', index=False, header=False, sep='\t')
        fold += 1
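
    To confirm the fold files look right before handing them to PaddleHub, a quick read-back (illustrative only):

    # Each row should be text<TAB>label, with labels in {-1, 0, 1}.
    check = pd.read_csv('train_0.txt', sep='\t', header=None, names=['text', 'label'])
    print(check.head())
    print(check['label'].value_counts())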
    
    # Wrap the fold files in a PaddleHub NLP dataset.
    class MyDataset(BaseNLPDataset):
        """Per-fold Weibo sentiment dataset."""
        def __init__(self, train_file_path="train_0.txt", dev_file_path="valid_0.txt"):
            # Directory that holds the fold files
            self.dataset_dir = "./"
            super(MyDataset, self).__init__(
                base_path=self.dataset_dir,
                train_file=train_file_path,
                dev_file=dev_file_path,
                train_file_with_header=False,
                dev_file_with_header=False,
                test_file_with_header=False,
                # Label set of the dataset
                label_list=["-1", "0", "1"])
    
    
    # Train one fold per run: p_idx selects the fold; each run saves its own
    # checkpoint directory and per-fold probabilities, merged further below.
    p_idx = 2
    for fold in range(0, folds):
        if fold != p_idx:
            continue
        module = hub.Module(name="ernie")
        # AdamW with weight decay and linear learning-rate warm-up over the
        # first 10% of training steps.
        strategy = hub.AdamWeightDecayStrategy(
                    weight_decay=0.01,
                    warmup_proportion=0.1,
                    learning_rate=5e-5)
        data = test[['微博中文内容']].fillna(' ').values.tolist()
        dataset = MyDataset(train_file_path='train_' + str(fold) + '.txt', dev_file_path='valid_' + str(fold) + '.txt')
        reader = hub.reader.ClassifyReader(
            dataset=dataset,
            vocab_path=module.get_vocab_path(),
            sp_model_path=module.get_spm_path(),
            word_dict_path=module.get_word_dict_path(),
            max_seq_len=170)
        inv_label_map = {val: key for key, val in reader.label_map.items()}
        config = hub.RunConfig(
            use_cuda=True,
            num_epoch=3,
            checkpoint_dir="model_"+str(fold),
            batch_size=64,
            eval_interval=500,
            strategy=strategy)
        
        inputs, outputs, program = module.context(trainable=True, max_seq_len=170)
        pooled_output = outputs["pooled_output"]
    
        feed_list = [
            inputs["input_ids"].name,
            inputs["position_ids"].name,
            inputs["segment_ids"].name,
            inputs["input_mask"].name,
        ]
    
        cls_task = hub.TextClassifierTask(
                data_reader=reader,
                feature=pooled_output,
                feed_list=feed_list,
                num_classes=dataset.num_labels,
                config=config,
                metrics_choices=["f1"])
        
        run_states = cls_task.finetune_and_eval()
        run_states = cls_task.predict(data=data)
        results = [run_state.run_results for run_state in run_states]
        # Accumulate soft predictions, averaging over the 5 folds; on the first
        # pass `proba` does not exist yet, which raises NameError.
        try:
            proba += np.vstack([r[0] for r in results]) / 5
        except NameError:
            proba = np.vstack([r[0] for r in results]) / 5
    
    # Build the per-fold submission: argmax the probabilities, then map the class
    # indices back to label strings via the reader's inverse label map.
    prediction = list(np.argmax(proba, axis=1))
    prediction = [inv_label_map[p] for p in prediction]

    submission = pd.DataFrame()
    submission['id'] = test['微博id'].values
    submission['id'] = submission['id'].astype(str) + ' '
    submission['y'] = prediction
    np.save('proba' + str(p_idx) + '.npy', proba)  # per-fold probabilities for later merging
    submission.to_csv('result.csv', index=False)
    submission['text'] = test[['微博中文内容']].fillna(' ').values
    # inv_label_map yields string labels, so the keys here must be strings too.
    submission['label'] = submission['y'].map({'-1': '消极', '0': '中性', '1': '积极'})
    result = pd.read_csv('result.csv')
    result.isna().sum()  # sanity checks (displayed in the notebook)
    len(result)
    # Align ids with the sample submission, then rewrite the file.
    sub = pd.read_csv('/home/aistudio/data/data22724/submit_example.csv')
    result['id'] = sub['id']
    result.to_csv('result.csv', index=False)
    
    # Merge the three trained folds' probabilities (simple soft voting) and
    # rebuild the submission the same way as above.
    proba0 = np.load('proba0.npy')
    proba1 = np.load('proba1.npy')
    proba2 = np.load('proba2.npy')
    proba = proba0 + proba1 + proba2
    prediction = list(np.argmax(proba, axis=1))
    prediction = [inv_label_map[p] for p in prediction]

    submission = pd.DataFrame()
    submission['id'] = test['微博id'].values
    submission['id'] = submission['id'].astype(str) + ' '
    submission['y'] = prediction
    np.save('proba_merged.npy', proba)  # keep the merged probabilities separate from the fold files
    submission.to_csv('result.csv', index=False)

    submission['text'] = test[['微博中文内容']].fillna(' ').values
    submission['label'] = submission['y'].map({'-1': '消极', '0': '中性', '1': '积极'})
    result = pd.read_csv('result.csv')
    result.isna().sum()
    len(result)
    sub = pd.read_csv('/home/aistudio/data/data22724/submit_example.csv')
    result['id'] = sub['id']
    result.to_csv('result.csv', index=False)
    

    三、Summary

    Over this break I studied PaddlePaddle, Python, and Linux. Honestly, I have only scratched the surface: I have a grasp of Python basics, but I am not yet familiar with PaddleHub's applications and cannot yet reason through code like this fully on my own. I should study harder, seek out more resources, and turn new and unfamiliar material into knowledge of my own.
