• pandas字符串/列等操作


    # In[1]
    import pandas as pd 
    import numpy as np
    import json
    import os 
    import re 
    
    # In[2]
    # !pwd
    # Every relative path used below is resolved against this directory.
    os.chdir('./root/FAQ/')

    # In[2]
    # The file stores all questions as one string, separated by the literal
    # token '[SEP]'. The encoding is passed explicitly so the text decodes the
    # same way on every platform instead of depending on the locale default.
    # NOTE(review): assumes the file is UTF-8, like the Negative*.json files
    # that are decoded as utf8 further down -- confirm.
    with open('./data/all_data.txt', 'r', encoding='utf-8') as f:
        data = f.read().split('[SEP]')
    # Built after the file is closed; the handle is not needed any longer.
    AQ = pd.DataFrame(data, columns=['question'])

    # In[3]
    # Winter-Olympics questions (answerable): the positive class, label 2.
    AQ['question'] = AQ['question'].str.strip()
    AQ['label'] = 2

    # In[3]
    AQ          # Winter-Olympics questions; only a part of them is kept later
    
    # In[4]
    def _json_lines_to_array_file(src_path, dst_path):
        """Rewrite a file of newline-separated JSON objects as one JSON array.

        The source is read as bytes and decoded as UTF-8, dropping undecodable
        bytes. Each non-empty line is taken as one record, and the records are
        written to ``dst_path`` joined by commas inside ``[...]`` so that
        ``pd.read_json`` can load the result.

        Records are split on newlines only. Splitting on arbitrary whitespace
        would also cut inside string values and replace every space in the
        text with a comma.
        NOTE(review): assumes one JSON object per line -- confirm against the
        data files.
        """
        with open(src_path, 'rb') as src:
            text = src.read().decode('utf8', 'ignore')
        # str.split('\n') rather than splitlines(): the latter also breaks on
        # characters such as U+2028 that may legally occur inside JSON strings.
        records = [row.strip() for row in text.split('\n') if row.strip()]
        with open(dst_path, 'w', encoding='utf-8') as dst:
            dst.write('[' + ','.join(records) + ']')

    # Dataset 1 (the data is scarce, so several sources are collected).
    # Sports but not Winter-Olympics; non-sports.
    _json_lines_to_array_file('./data/Negative.json', './data/Negative.txt')

    # In[5]
    # Sports but not Winter-Olympics; non-sports.
    NoAQ = pd.read_json('./data/Negative.txt')
    NoAQ['title']

    # In[4]
    # Dataset 2
    # Sports but not Winter-Olympics; non-sports.
    _json_lines_to_array_file('./data/Negative02.json', './data/Negative02.txt')

    # In[5]
    NoAQ02 = pd.read_json('./data/Negative02.txt')
    NoAQ02['title']

    # Dataset 3 (nothing is loaded for it in this script)
    
    
    # In[5]
    # Stack both negative datasets. DataFrame.append was removed in pandas 2.0,
    # so pd.concat is used. ignore_index=True renumbers the rows 0..n-1, which
    # keeps row labels unique and equal to row positions; without it the labels
    # of the second dataset would repeat those of the first and invite
    # label-vs-position mix-ups when rows are selected later.
    NoAQ = pd.concat([NoAQ, NoAQ02], ignore_index=True)
    print(len(NoAQ))

    # In[6]
    # (recorded output of the original run: 285155)
    # Keep at most as many positive rows as there are negative rows.
    train_len = len(NoAQ)
    AQ = AQ.iloc[:train_len]
    print('东奥(可回答): ', len(AQ))
    # (recorded output of the original run: 285155)
    print('体育-非东奥 + 非体育类:', train_len)
    AQ
    
    # In[7]
    # Negative samples (sports but not Winter-Olympics, and non-sports):
    # discard the columns that are not needed and rename what is left.
    # The one-element column list only fits if exactly one column remains
    # (presumably 'title' -- verify against the data files).
    NoAQ = NoAQ.drop(columns=['answer', 'desc', 'url'])
    NoAQ.columns = ['question']
    NoAQ

    # In[8]
    # Start with every negative row in the non-sports class: label 0.
    NoAQ = NoAQ.assign(label=0)
    NoAQ
    
    # In[9]
    # Sports-related but not Winter-Olympics: label 1.
    # A question counts as sports-related when it contains any keyword below.
    sports = ['雪', '赢', '速', '跳', '滑', '冬', '自由', '冰', '剧烈', '开赛', 'vs', '武术', '奥运会', '健身', '跑步', '打球', '强', '壮', '体育', '运动员', '运动', '活动', '训练', '得分', '比赛', '参赛', '球']
    # Escape the keywords so each one is matched literally, and treat missing
    # questions as "no match" so the mask is purely boolean.
    found = NoAQ['question'].str.contains('|'.join(re.escape(word) for word in sports), na=False)
    sports_idx = NoAQ.index[found.to_numpy()]
    print(len(sports_idx))
    # Assign through a positional boolean mask in a single .loc call. Feeding
    # index labels to .iloc marks the wrong rows whenever the labels are not
    # 0..n-1 (e.g. after stacking frames without resetting the index), and a
    # chained NoAQ['label'].iloc[...] assignment may write to a copy.
    NoAQ.loc[found.to_numpy(), 'label'] = 1

    # In[9]
    NoAQ.loc[NoAQ['label'] == 1]
    NoAQ.loc[NoAQ['label'] == 0]
    # In[10]
    # Merge positive and negative samples into one frame.
    # DataFrame.append was removed in pandas 2.0; pd.concat with its defaults
    # yields the same rows, columns and row labels.
    AQ = pd.concat([AQ, NoAQ])
    # In[11]
    AQ
    # In[11]
    # Tab-separated output; the row labels are written as the first column.
    AQ.to_csv('./data/FAQ.csv', sep='\t')

    # In[12]
    # Read the file back to check what was written.
    test = pd.read_csv('./data/FAQ.csv', sep='\t')
    test
    # %%
    
    
  • 相关阅读:
    购买云主机时应该注意哪些事项
    wdcp的安装方法与常见问题
    推荐一些不错的计算机书籍(php c mysql linux等等)
    .htaccess详解及.htaccess参数说明【转】
    常用的7个.htaccess代码组织某个国家的IP访问
    最新ecshop v2.7.3版本去版权完全版
    javascript常用数组算法总结
    10道javascript笔试题
    Magento显示多货币,Magento 多货币设置
    Magento后台手动修改订单状态方法及手动修改方法php
  • 原文地址:https://www.cnblogs.com/douzujun/p/14375660.html
Copyright © 2020-2023  润新知