# In[1]
import pandas as pd
import numpy as np
import json
import os
import re
# In[2]
# !pwd
# Work from the FAQ project directory so that the relative ./data/... paths
# used throughout this script resolve.
os.chdir('./root/FAQ/')
# In[2]
# The file holds all questions in one string, separated by the literal marker
# '[SEP]'.  The encoding is given explicitly because the other dataset files
# in this script are decoded as UTF-8; relying on the platform default would
# break on systems whose default is not UTF-8.
with open('./data/all_data.txt', 'r', encoding='utf-8') as f:
    data = f.read().split('[SEP]')
# One row per question, in file order.
AQ = pd.DataFrame(data, columns=['question'])
# In[3]
# Winter-Olympics questions (the answerable ones) form the positive class.
# Trim surrounding whitespace from every question and tag each row with 2.
AQ = AQ.assign(question=AQ['question'].str.strip(), label=2)
# In[3]
AQ  # Winter-Olympics class; only part of it is kept further down
# In[4]
# Dataset 1 (the individual sources are small, so several are combined).
# Content: sports questions that are not about the Olympics (presumably the
# Winter Olympics used for the positive class), plus non-sports questions.
def _json_stream_to_array(text):
    """Return *text*, a run of concatenated JSON values, as one JSON array.

    The values may be separated by any amount of whitespace (one per line,
    space-separated, or not separated at all).  Each value is located with
    ``JSONDecoder.raw_decode`` and copied verbatim, so whitespace *inside* a
    value (for example a space in a question title) is preserved.  Splitting
    the text on whitespace instead would turn such a space into a comma.

    Raises ``json.JSONDecodeError`` if a value cannot be parsed.
    """
    # strict=False: tolerate control characters inside strings; this scan is
    # only used to find where each value ends.
    decoder = json.JSONDecoder(strict=False)
    skip_ws = re.compile(r'\s*').match
    records = []
    pos = skip_ws(text, 0).end()
    while pos < len(text):
        _, stop = decoder.raw_decode(text, pos)
        records.append(text[pos:stop])
        pos = skip_ws(text, stop).end()
    return '[' + ','.join(records) + ']'


def _rewrite_as_json_array(src_path, dst_path):
    """Convert the JSON stream in *src_path* to a JSON array in *dst_path*.

    The source is read as bytes and decoded as UTF-8, dropping undecodable
    bytes.  The result is written as UTF-8, which is the encoding
    ``pd.read_json`` assumes by default.  Returns the decoded source text.
    """
    with open(src_path, 'rb') as src:
        text = src.read().decode('utf8', 'ignore')
    with open(dst_path, 'w', encoding='utf-8') as dst:
        dst.write(_json_stream_to_array(text))
    return text


line = _rewrite_as_json_array('./data/Negative.json', './data/Negative.txt')
# In[5]
# Sports (non-Olympic) and non-sports questions.
NoAQ = pd.read_json('./data/Negative.txt')
NoAQ['title']
# In[4]
# Dataset 2
# Same kind of content as dataset 1.
line2 = _rewrite_as_json_array('./data/Negative02.json', './data/Negative02.txt')
# In[5]
NoAQ02 = pd.read_json('./data/Negative02.txt')
NoAQ02['title']
# Dataset 3 (heading only; no third file is loaded here)
# In[5]
# Stack the two negative datasets.  DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat is the equivalent call and, like
# append, keeps each frame's own row labels (so labels repeat in the result).
NoAQ = pd.concat([NoAQ, NoAQ02])
print(len(NoAQ))
# In[6]
# 285155  (presumably output recorded from a notebook run)
# 285155
# Cap the positive set at the size of the negative set: keep at most as many
# leading positive rows as there are negative rows.
train_len = NoAQ.shape[0]
AQ = AQ.head(train_len)
print('东奥(可回答): ', AQ.shape[0])
# 285155
print('体育-非东奥 + 非体育类:', train_len)
AQ
# In[7]
# Reduce the negative examples to their question text.
# Discard the answer, description and URL columns, then rename what is left.
# The rename requires exactly one column to remain (presumably 'title') and
# gives it the same name as the positive frame's text column.
NoAQ = NoAQ.drop(columns=['answer', 'desc', 'url'])
NoAQ.columns = ['question']
NoAQ
# In[8]
# Every negative row starts as class 0 (non-sports).
NoAQ['label'] = 0
NoAQ
# In[9]
# Class 1: sports-related questions that are not about the Olympics,
# detected by a keyword match on the question text.
sports = ['雪', '赢', '速', '跳', '滑', '冬', '自由', '冰', '剧烈', '开赛', 'vs', '武术', '奥运会', '健身', '跑步', '打球', '强', '壮', '体育', '运动员', '运动', '活动', '训练', '得分', '比赛', '参赛', '赢', '球']
# Keywords are escaped so each one is matched literally.  na=False makes rows
# with missing question text count as "no match" rather than putting NaN
# into the mask, which cannot be used for boolean selection.
found = NoAQ['question'].str.contains('|'.join(map(re.escape, sports)), na=False)
sports_idx = NoAQ['question'][found].index
print(len(sports_idx))
# Write through a single .loc call using the mask as a plain boolean array.
# Row labels must not be used as positions here: NoAQ is built by stacking
# two frames without renumbering, so labels repeat and do not equal row
# positions.  A chained NoAQ['label'].iloc[...] = 1 would also risk assigning
# to a temporary copy instead of NoAQ itself.
NoAQ.loc[found.to_numpy(), 'label'] = 1
# In[9]
NoAQ.loc[NoAQ['label'] == 1]
NoAQ.loc[NoAQ['label'] == 0]
# In[10]
# Combine the data: positive rows (label 2) followed by negative rows
# (labels 0 and 1).  pd.concat replaces DataFrame.append, which was removed
# in pandas 2.0; row labels are kept as they are, exactly as append did.
AQ = pd.concat([AQ, NoAQ])
# In[11]
AQ
# In[11]
# Write a space-separated file; the row labels go out as the first column.
AQ.to_csv('./data/FAQ.csv', sep=' ')
# In[12]
# Read the file back to inspect what was written.
test = pd.read_csv('./data/FAQ.csv', sep=' ')
test
# %%