# In[1]
import os
# Work from the dataset root so the relative './raw_data' and './swtc' paths
# used further down resolve against it.
path = '/home/zjdou/jupyter/root/Smart-Writing/TextClassification/DATA'
os.chdir(path)
# Echo the working directory that is now in effect.
print(os.getcwd())
# In[2]
import pandas as pd
import numpy as np
# Load the two raw JSON dumps.
file01 = pd.read_json('./raw_data/gov01.json')
file01
file02 = pd.read_json('./raw_data/gov02.json')
file02
# Stack both frames into one, renumbering the index from 0.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and yields the same frame.
total = pd.concat([file01, file02], ignore_index=True)
# In[1]
total
# In[2]
# Rows whose first topic entry is the empty string carry no label: collect
# their index labels, drop them, then renumber the remaining rows from 0.
no_label_idx = total.loc[total['topics'].map(lambda entry: entry[0]) == ''].index
no_label_idx
total.drop(index=no_label_idx, inplace=True)
total.reset_index(drop=True, inplace=True)
# In[2]
# Replace each topics value by its first element.
total['topics'] = total['topics'].map(lambda entry: entry[0])
total
total.to_json('./swtc/total.json')
# In[4]
# Split data: rewrite the '/' separator inside topic strings as '-'.
total = pd.read_json('./swtc/total.json')
# To check the rewrite, select the rows whose topic still contains '/'
# (total['topics'].apply(lambda x: x.find('/') != -1)) before and after it.
total['topics'] = total['topics'].map(lambda topic: topic.replace('/', '-'))
total.to_json('./swtc/total.json')
total
# In[1]
# Topics that contain no '-' at all get '-其他' ("-other") appended.
total['topics'] = total['topics'].map(
    lambda topic: topic + '-其他' if '-' not in topic else topic
)
total.to_json('./swtc/total.json')
# In[2]
# Expand the data
total = pd.read_json('./swtc/smartwrite2_train.json')
total
# In[3]
# Each pass replaces every row by two rows: one holding the leading half of
# its content and one holding the trailing half, with title and topics copied
# unchanged. All leading halves come first, then all trailing halves.
# Two passes (split, then split once more) quadruple the number of rows.
for _ in range(2):
    front_total = pd.DataFrame({
        'title': total.title,
        'content': total['content'].apply(lambda x: x[:len(x) // 2]),
        'topics': total.topics,
    })
    beh_total = pd.DataFrame({
        'title': total.title,
        'content': total['content'].apply(lambda x: x[len(x) // 2:]),
        'topics': total.topics,
    })
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and yields the same frame.
    total = pd.concat([front_total, beh_total], ignore_index=True)
# print(len(total.content[0]))
# total.to_json('./swtc/smartwrite4_train.json')
# In[2]
import re
# total = pd.read_json('./swtc/smartwrite3_train.json')
total = pd.read_json('./swtc/new_total.json')


def _quoted_names(text):
    """Return the distinct 《…》-quoted names in *text*, comma-joined.

    Names are ordered longest first. Names of equal length are ordered
    alphabetically so the result does not depend on set iteration order,
    which varies between interpreter runs for strings.
    """
    names = set(re.findall(r'《(.*?)》', text))
    return ','.join(sorted(names, key=lambda name: (-len(name), name)))


# One comma-joined string of quoted names per row, stored in a new column.
total['extra'] = total['content'].apply(_quoted_names).to_list()
# In[3]
total.to_json('total_and_extra.json')
# In[4]
# Shuffle every row. No seed is passed to sample(), so the resulting split
# differs from run to run.
shuffle_total = total.sample(frac=1).reset_index(drop=True)
shuffle_total
# In[1]
# 80 / 10 / 10 split into train / dev / test.
# Multiply before dividing: `total_len // 10 * 8` truncates first and pushes
# up to 9 leftover rows into the test split (19 rows would give 8 / 1 / 10).
total_len = len(shuffle_total)
train_len = total_len * 8 // 10
dev_len = total_len // 10
test_len = total_len - train_len - dev_len
print(train_len, dev_len, test_len)
train = shuffle_total.iloc[:train_len]
train.to_json('./swtc/smartwrite5_train.json')
dev = shuffle_total.iloc[train_len:train_len + dev_len]
dev.to_json('./swtc/smartwrite5_dev.json')
# Everything after train and dev is the test split (test_len rows).
test = shuffle_total.iloc[train_len + dev_len:]
test.to_json('./swtc/smartwrite5_test.json')
# %%
# train.to_json('./swtc/')
# pd.read_json('./swtc/smartwrite_dev.json')
# a = pd.read_json('./raw_data/国务院部门文件_国务院政策文件库_中国政府网.json')
# b = pd.read_json('./raw_data/国务院文件_国务院政策文件库_中国政府网.json')
# len(a) + len(b)
# %%
# Read the three split files back from disk, in train / dev / test order.
train, dev, test = (
    pd.read_json(split_file)
    for split_file in (
        './swtc/smartwrite5_train.json',
        './swtc/smartwrite5_dev.json',
        './swtc/smartwrite5_test.json',
    )
)
train
dev
test
# In[1]
# Collect the topic label of every row.
total = pd.read_json('./swtc/total.json')
topics = total['topics'].to_list()
print(topics)
# all_topics = len(topics)
# print(all_topics)
# Distinct topics. They are sorted because plain set iteration order for
# strings changes between interpreter runs, which would give each topic a
# different id on every run.
dup_topics = sorted(set(topics))
print(dup_topics, len(dup_topics))
# %%
# Map each distinct topic to its position in the sorted list.
topics_dic = {topic: i for i, topic in enumerate(dup_topics)}
print(topics_dic)