一、Background
This project tackles sentiment recognition of netizens' Weibo posts during the epidemic. PaddleHub, produced by the PaddlePaddle team, is a pre-trained model management and transfer learning tool: it makes it easy to obtain pre-trained models from the PaddlePaddle ecosystem, manage them, and run one-click prediction. Combined with its Fine-tune API, transfer learning on top of large-scale pre-trained models can be completed quickly, so that pre-trained models better serve a user's specific scenario. This project therefore uses Baidu's PaddleHub fine-tuning tool to quickly build a competition solution.
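In outline, the PaddleHub 1.x Fine-tune workflow used below is always the same: load a pre-trained module, wrap the task data in a dataset and reader, choose an optimization strategy and run configuration, build a task on top of the module's output, and call finetune_and_eval. A minimal sketch, assuming PaddleHub 1.x and its bundled ChnSentiCorp demo dataset (the competition pipeline in the next section follows the same pattern with a custom dataset):
import paddlehub as hub

module = hub.Module(name="ernie")                      # pre-trained ERNIE
dataset = hub.dataset.ChnSentiCorp()                   # demo sentiment dataset bundled with PaddleHub
reader = hub.reader.ClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=128)
strategy = hub.AdamWeightDecayStrategy(learning_rate=5e-5)
config = hub.RunConfig(num_epoch=1, batch_size=32, strategy=strategy)
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
task = hub.TextClassifierTask(
    data_reader=reader,
    feature=outputs["pooled_output"],
    feed_list=[inputs[name].name for name in
               ["input_ids", "position_ids", "segment_ids", "input_mask"]],
    num_classes=dataset.num_labels,
    config=config)
task.finetune_and_eval()                               # one call completes the transfer learning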
二、Code
# # Unzip the datasets
# !cd data/data22724 && unzip test_dataset.zip
# !cd data/data22724 && unzip "train_ dataset.zip"
# !hub install ernie
import pandas as pd
import numpy as np
import jieba
import re
import paddlehub as hub
from sklearn.model_selection import StratifiedKFold
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from matplotlib import pyplot as plt
%matplotlib inline
# Patterns and substrings to strip from the Weibo text: whitespace, reply markers,
# link placeholders, hashtags, ASCII letters/digits, and stray punctuation
unuseful = [' ', '\n', '2[\u4e00-\u9fa5]{2,7}·.*??', '【.*?】', '//@.*?:', '//@.*?:', '回复@.*?:', 'O网页链接',
            '?展开全文c', '我免费围观了.*?~O微博问答?', '#.*?#', '?', '[A-Za-z0-9]',
            '[\u0800-\u4e00]', '-', '、', '~', '『', '』', '—', '(.*?)', '年', '月', '日', '(.*?)', '◎', '"', '"']
# Load the labeled training set (GB18030-encoded) and keep only the valid labels -1/0/1
with open(file='/home/aistudio/data/data22724/nCoV_100k_train.labled.csv', mode='r', encoding='gb18030', errors='ignore') as fp:
    train_labled = pd.read_csv(fp)
train_labled = train_labled[train_labled['情感倾向'].isin(['-1', '0', '1'])]
# Strip the noise patterns, then collapse repeated punctuation
for content in unuseful:
    train_labled['微博中文内容'] = train_labled['微博中文内容'].str.replace(content, '', regex=True)
train_labled['微博中文内容'] = (train_labled['微博中文内容']
    .str.replace('!{2,}', '!', regex=True).str.replace('《', '').str.replace('》', '')
    .str.replace('。{2,}', '。', regex=True).str.replace(r'\.+', '.', regex=True)
    .str.replace('【', '').str.replace('】', ''))
train_labled['微博中文内容'] = (train_labled['微博中文内容']
    .str.replace('?{2,}', '?', regex=True).str.replace(r'\.{2,}', '.', regex=True).str.replace(' ', ''))
# Load and clean the test set in the same way
with open(file='/home/aistudio/data/data22724/nCov_10k_test.csv', mode='r', encoding='gb18030', errors='ignore') as fp:
    test = pd.read_csv(fp)
for content in unuseful:
    test['微博中文内容'] = test['微博中文内容'].str.replace(content, '', regex=True)
test['微博中文内容'] = (test['微博中文内容']
    .str.replace('!{2,}', '!', regex=True).str.replace('《', '').str.replace('》', '')
    .str.replace('。{2,}', '。', regex=True).str.replace(r'\.+', '.', regex=True)
    .str.replace('【', '').str.replace('】', ''))
test['微博中文内容'] = (test['微博中文内容']
    .str.replace('?{2,}', '?', regex=True).str.replace(r'\.{2,}', '.', regex=True).str.replace(' ', ''))
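To see what the cleaning actually does, it can help to compare one post before and after. A small illustrative check (the variable raw_test and the printed row are only for demonstration, not part of the pipeline):
# Illustrative before/after comparison on the first test post (output depends on the data)
with open(file='/home/aistudio/data/data22724/nCov_10k_test.csv', mode='r', encoding='gb18030', errors='ignore') as fp:
    raw_test = pd.read_csv(fp)
print(raw_test['微博中文内容'].iloc[0])
print(test['微博中文内容'].iloc[0])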
# train_labled[['微博中文内容', '情感倾向']].to_csv('train.txt')
# Remove meaningless characters; fall back to a placeholder if nothing is left
def del_reply_mark(sentence):
    output = re.sub(unuseful[0], '', sentence)
    for cont in unuseful[1:]:
        output = re.sub(cont, '', output)
    # for cont in stars[1:]:
    #     output = re.sub(cont, '', output)
    if output == "":
        output = "***"
    return output
# Normalize punctuation: map full-width characters to half-width and unify ellipses
def rep_chn_punc(sentence):
    table = {ord(f): ord(t) for f, t in zip(
        u',。!?【】()%#@&1234567890①②③④⑤、·:[]():;',
        u',.!?....%#@&123456789012345,........')}
    output = (sentence.translate(table).replace("...", "…").replace("《", "").replace("》", "").replace("℃", "度")
              .replace("——", "").replace("..", "…").replace("「", "").replace("」", "").replace("....", "…")
              .replace(".....", "…").replace("T T", "TT")
              .replace("T_T", "TT"))
    return output
# Tokenize with jieba and drop stopwords (requires a `stopwords` collection to be loaded beforehand)
def chn_tokenize(sentence):
    line_list = jieba.lcut(sentence, HMM=True)
    out_str = ''
    for word in line_list:
        if word not in stopwords:
            if word != ' ':
                out_str += word
    return out_str
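chn_tokenize relies on a stopwords collection that is not defined in this listing. A minimal sketch, assuming a UTF-8 stopword file with one word per line (the file name cn_stopwords.txt is only a placeholder), followed by an example of chaining the three helpers on a made-up post:
# Hypothetical stopword file; the notebook does not show where the stopwords come from
with open('cn_stopwords.txt', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f)

# Chain the three helpers on one made-up Weibo post
raw = '回复@某用户:今天的天气真好!!!O网页链接#随手拍#'
print(chn_tokenize(rep_chn_punc(del_reply_mark(raw))))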
# Stratified 5-fold split; each fold is written out as space-separated "text label" files
folds = 5
sfolder = StratifiedKFold(n_splits=folds, random_state=1, shuffle=True)
train_labled = train_labled[['微博中文内容', '情感倾向']]
fold = 0
for train_index, valid_index in sfolder.split(train_labled['微博中文内容'], train_labled['情感倾向']):
    train = train_labled.iloc[train_index.tolist()]
    valid = train_labled.iloc[valid_index.tolist()]
    train.to_csv('train_' + str(fold) + '.txt', index=False, header=False, sep=' ')
    valid.to_csv('valid_' + str(fold) + '.txt', index=False, header=False, sep=' ')
    fold += 1
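A quick optional check, not part of the original pipeline, that the generated fold files contain two space-separated columns and only the three expected labels (fold0 is just a throwaway name):
# Peek at the first fold file: expect "<text> <label>" rows with labels in {-1, 0, 1}
fold0 = pd.read_csv('train_0.txt', sep=' ', header=None, names=['text', 'label'])
print(len(fold0), fold0['label'].unique())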
class MyDataset(BaseNLPDataset):
    """Custom dataset wrapping the per-fold train/valid text files"""
    def __init__(self, train_file_path="train_0.txt", dev_file_path="valid_0.txt"):
        # Directory the dataset files live in
        self.dataset_dir = "./"
        super(MyDataset, self).__init__(
            base_path=self.dataset_dir,
            train_file=train_file_path,
            dev_file=dev_file_path,
            train_file_with_header=False,
            dev_file_with_header=False,
            test_file_with_header=False,
            # Label set of the dataset
            label_list=["-1", "0", "1"])
# Train and predict on a single fold (p_idx); rerunning the notebook with p_idx = 0/1/2
# produces the per-fold probability files used in the later ensemble
p_idx = 2
for fold in range(0, folds):
    if fold != p_idx:
        continue
    # Load the pre-trained ERNIE module
    module = hub.Module(name="ernie")
    # Fine-tuning strategy: AdamW with warmup and weight decay
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=0.01,
        warmup_proportion=0.1,
        learning_rate=5e-5)
    # Test texts to predict on, as a list of [text] rows
    data = test[['微博中文内容']].fillna(' ').values.tolist()
    dataset = MyDataset(train_file_path='train_' + str(fold) + '.txt', dev_file_path='valid_' + str(fold) + '.txt')
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path(),
        max_seq_len=170)
    # Map label indices back to the original label strings
    inv_label_map = {val: key for key, val in reader.label_map.items()}
    config = hub.RunConfig(
        use_cuda=True,
        num_epoch=3,
        checkpoint_dir="model_" + str(fold),
        batch_size=64,
        eval_interval=500,
        strategy=strategy)
    inputs, outputs, program = module.context(trainable=True, max_seq_len=170)
    pooled_output = outputs["pooled_output"]
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]
    # Text classification head on top of ERNIE's pooled output, evaluated with F1
    cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config,
        metrics_choices=["f1"])
    run_states = cls_task.finetune_and_eval()
    run_states = cls_task.predict(data=data)
    results = [run_state.run_results for run_state in run_states]
    # Accumulate this fold's class probabilities (scaled by the number of folds)
    try:
        proba += np.vstack([r[0] for r in results]) / 5
    except NameError:
        proba = np.vstack([r[0] for r in results]) / 5
prediction = list(np.argmax(proba, axis=1))
prediction = [inv_label_map[p] for p in prediction]
# Build this fold's submission and save its class probabilities for the later ensemble
submission = pd.DataFrame()
submission['id'] = test['微博id'].values
submission['id'] = submission['id'].astype(str) + ' '
submission['y'] = prediction
np.save('proba' + str(p_idx) + '.npy', proba)
submission.to_csv('result.csv', index=False)
submission['text'] = test[['微博中文内容']].fillna(' ').values
# Readable sentiment names: 消极 = negative, 中性 = neutral, 积极 = positive
submission['label'] = submission['y'].map({'-1': '消极', '0': '中性', '1': '积极'})
result = pd.read_csv('result.csv')
result.isna().sum()
len(result)
# Replace the ids with those from the official sample submission
sub = pd.read_csv('/home/aistudio/data/data22724/submit_example.csv')
result['id'] = sub['id']
result.to_csv('result.csv', index=False)
# Ensemble: load the per-fold probabilities and sum them (for the argmax this is
# equivalent to averaging), then rebuild the submission from the combined scores
proba0 = np.load('proba0.npy')
proba1 = np.load('proba1.npy')
proba2 = np.load('proba2.npy')
proba = proba0 + proba1 + proba2
prediction = list(np.argmax(proba, axis=1))
prediction = [inv_label_map[p] for p in prediction]
submission = pd.DataFrame()
submission['id'] = test['微博id'].values
submission['id'] = submission['id'].astype(str) + ' '
submission['y'] = prediction
# np.save('proba' + str(p_idx) + '.npy', proba)  # leftover from the per-fold block; would overwrite proba2.npy
submission.to_csv('result.csv', index=False)
submission['text'] = test[['微博中文内容']].fillna(' ').values
submission['label'] = submission['y'].map({'-1': '消极', '0': '中性', '1': '积极'})
result = pd.read_csv('result.csv')
result.isna().sum()
len(result)
# Align ids with the official sample submission and write the final result
sub = pd.read_csv('/home/aistudio/data/data22724/submit_example.csv')
result['id'] = sub['id']
result.to_csv('result.csv', index=False)
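As a final optional check (not in the original notebook), the ensembled submission can be compared against the sample file for row count, missing values, and the overall label distribution:
# Final sanity check: same length as the sample submission, no NaNs, plausible label mix
print(len(result) == len(sub))
print(result.isna().sum())
print(result['y'].value_counts())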
三、Summary
Over this break I got acquainted with PaddlePaddle, Python, and Linux. Honestly, I have only scratched the surface: I have picked up some Python basics, but I am not yet very familiar with how PaddleHub is applied, and I cannot yet think through this kind of code completely on my own. I need to study harder, look for more resources, and turn new and unfamiliar material into knowledge of my own.