pandas字符串/列等操作

# In[1]
import pandas as pd 
import numpy as np
import json
import os 
import re 

# In[2]
# !pwd
# Work from the project directory so the relative './data/...' paths resolve.
os.chdir('./root/FAQ/')

# In[2]
# The file holds many entries separated by the literal token '[SEP]'.
# The encoding is pinned to UTF-8 (the other data files in this script are
# decoded as UTF-8 too) instead of depending on the platform default.
with open('./data/all_data.txt', 'r', encoding='utf-8') as f:
    data = f.read().split('[SEP]')
# One row per entry; the frame is built once the file has been closed.
AQ = pd.DataFrame(data, columns=['question'])
    
# In[3]
# Winter Olympics questions can be answered, so they are the positive class (2).
# Surrounding whitespace is trimmed from every question in the same step.
AQ = AQ.assign(question=AQ['question'].str.strip(), label=2)

# In[3]
AQ          # Winter Olympics questions; only part of them is kept later on

# In[4]
# Negative examples: sports questions unrelated to the Winter Olympics, plus
# non-sports questions. Several source files are used because each is small.
def _rewrite_as_json_array(src_path, dst_path):
    """Rewrite a file of whitespace-separated JSON values as one JSON array.

    The source is read as bytes and decoded as UTF-8 with undecodable bytes
    dropped. Values are located with a JSON parser rather than by splitting
    on whitespace, so spaces inside a record's text cannot cut it apart.

    Args:
        src_path: Path of the file holding the concatenated JSON values.
        dst_path: Path the resulting JSON array is written to.

    Raises:
        json.JSONDecodeError: If the decoded text contains invalid JSON.
    """
    with open(src_path, 'rb') as src:
        text = src.read().decode('utf8', 'ignore')

    decoder = json.JSONDecoder()
    records = []
    pos, size = 0, len(text)
    while True:
        # raw_decode does not skip leading whitespace, so step over it here.
        while pos < size and text[pos].isspace():
            pos += 1
        if pos >= size:
            break
        record, pos = decoder.raw_decode(text, pos)
        records.append(record)

    # json.dump escapes non-ASCII characters by default, so the file reads
    # back identically whatever the platform's default encoding is.
    with open(dst_path, 'w', encoding='utf-8') as dst:
        json.dump(records, dst)


# Dataset 1
_rewrite_as_json_array('./data/Negative.json', './data/Negative.txt')

# In[5]
NoAQ = pd.read_json('./data/Negative.txt')
NoAQ['title']

# In[4]
# Dataset 2
_rewrite_as_json_array('./data/Negative02.json', './data/Negative02.txt')

# In[5]
NoAQ02 = pd.read_json('./data/Negative02.txt')
NoAQ02['title']

# 数据集3


# In[5]
# Stack both negative datasets into one frame. DataFrame.append was removed
# in pandas 2.0, so pd.concat is used instead; ignore_index=True renumbers
# the rows 0..n-1 so that row labels stay unique after stacking.
NoAQ = pd.concat([NoAQ, NoAQ02], ignore_index=True)
print(len(NoAQ))

# In[6]
# Keep at most as many positive rows as there are negative rows.
# Recorded output of this cell: 285155 for both counts.
train_len = NoAQ.shape[0]
AQ = AQ.head(train_len)
print('东奥(可回答): ', AQ.shape[0])
print('体育-非东奥 + 非体育类：', train_len)
AQ

# In[7]
# Negative examples only need their question text: discard the other
# fields, then rename the single remaining column to 'question'.
unused_columns = ['answer', 'desc', 'url']
NoAQ = NoAQ.drop(columns=unused_columns)
NoAQ.columns = ['question']
NoAQ

# In[8]
# Default label for every negative row: 0 = not sports-related.
NoAQ['label'] = 0
NoAQ

# In[9]
# Rows whose question mentions any sports keyword are relabelled 1
# (sports-related, but not about the Winter Olympics).
sports = ['雪', '赢', '速', '跳', '滑', '冬', '自由', '冰', '剧烈', '开赛', 'vs', '武术', '奥运会', '健身', '跑步', '打球', '强', '壮', '体育', '运动员', '运动', '活动', '训练', '得分', '比赛', '参赛', '赢', '球']
# Escape each keyword so it is always matched literally inside the regex.
pattern = '|'.join(re.escape(word) for word in sports)
# na=False counts a missing question as "no match" instead of producing NaN,
# which could not be used as a boolean mask.
found = NoAQ['question'].str.contains(pattern, na=False)
sports_mask = found.to_numpy()
sports_idx = NoAQ.index[sports_mask]
print(len(sports_idx))
# Assign through a positional boolean mask in a single .loc call. The former
# `NoAQ['label'].iloc[sports_idx] = 1` handed row *labels* to the positional
# indexer, which relabels the wrong rows whenever labels repeat or are not
# 0..n-1, and its chained assignment may write to a temporary copy.
NoAQ.loc[sports_mask, 'label'] = 1

# In[9]
# Notebook inspection only: rows labelled 1, then rows labelled 0.
NoAQ[NoAQ['label'].eq(1)]
NoAQ[NoAQ['label'].eq(0)]
# In[10]
# Combine the positive and negative examples into one dataset.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead;
# ignore_index=True gives every row of the combined frame a unique label.
AQ = pd.concat([AQ, NoAQ], ignore_index=True)
# In[11]
AQ
# In[11]
# Tab-separated export; the row index is written as the first column.
AQ.to_csv('./data/FAQ.csv', sep='\t')

# In[12]
# Read the export back as a sanity check. index_col=0 restores the saved
# index instead of surfacing it as an extra 'Unnamed: 0' column.
test = pd.read_csv('./data/FAQ.csv', sep='\t', index_col=0)
test
# %%

相关阅读:
oracle11g 卸载和安装（win7，32位）
MySQL忘记密码解决办法
GPIO硬件资源的申请，内核空间和用户空间的数据交换，ioctl(.....),设备文件的自动创建
模块参数，系统调用，字符设备编程重要数据结构，设备号的申请与注册，关于cdev的API
开发环境的搭建,符号导出，打印优先级阈值
定时器中断
Linux系统移植的重要文件
linux 相关指令
linux各文件夹含义和作用
外部中断实验

原文地址：https://www.cnblogs.com/douzujun/p/14375660.html