搜狗新闻原始数据处理

简介：

下载的是搜狗新闻一个月版本的SogouCS.reduced，大约698M，包含128个txt文件

主要处理包括：转码，提取content和URL

处理之前：

每个文件中每条内容如下xml格式：

<doc>
<url>http://sports.sohu.com/20080627/n257795172_4.shtml</url>
<docno>215799a267c29427-71013306c0bb3300</docno>
<contenttitle>组图：蕊蕊拦网薛明暴扣　陈忠和发布会笑逐颜开</contenttitle>
<content>跳转至：Ｒ常担２６Ｎ依此盗骄潺１本┦奔洌对拢玻啡眨２００８年世界女排大奖赛第二周比赛继续进行，在中国香港站的一场焦点大战中，中国女排苦战五局，以３－２（２５－１８、２５－２７、２１－２５、２５－２１、１５－１３）击败了不久前在瑞士女排精英赛３－１战胜过自己的古巴女排，赢得中国香港站开门红。图为比赛精彩画面。＃ㄔ鹑伪嗉：王燕芳）＞彩图片</content>
</doc>

处理之后：

共15个类别。各类别的数据分布不均匀，猜测与各类新闻的热度有关。

后续分析：

待补充

代码如下：

（1）包括转码和提取数据

# -*- coding: utf-8 -*-

'''
该脚本用于将搜狗语料库新闻语料
转化为按照URL作为类别名、
content作为内容的txt文件存储
'''
import os
import re
    
# Minimum content length (in characters); shorter <content> entries are
# not saved.
threh = 30

# Collect the list of files under the raw-corpus folder.
def listdir_get(path, list_name):
    """
    Recursively gather the paths of all non-directory entries below ``path``.

    :param path: directory to scan
    :param list_name: list that receives the joined file paths; it is
        extended in place and nothing is returned
    """
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if not os.path.isdir(full_path):
            # Plain file: record it and move on to the next entry.
            list_name.append(full_path)
            continue
        # Sub-directory: descend and keep filling the same list.
        listdir_get(full_path, list_name)


'''
#修改文件编码为utf-8
from chardet import detect
def code_transfer(list_name):
    for fn in list_name: 
        with open(fn, 'rb+') as fp:
            content = fp.read()
            codeType = detect(content)['encoding']
            content = content.decode(codeType, "ignore").encode("utf8")
            fp.seek(0)
            fp.write(content)
            print(fn, "：已修改为utf8编码")
'''
def processing(list_name, output_dir=None, min_length=None):
    """
    Extract news bodies from corpus files and append them to one text file
    per category.

    Each input file is read as UTF-8 and scanned for ``<url>...</url>`` and
    ``<content>...</content>`` elements. The category is the sub-domain in
    front of ``.sohu.com`` in the URL (e.g. ``sports``). Every kept content
    string is appended as one line to ``<output_dir>/<category>.txt``.

    :param list_name: iterable of paths of the corpus files to process
    :param output_dir: folder receiving the per-category files; defaults to
        a ``processed`` folder under the module-level ``data_original_path``
    :param min_length: contents with fewer characters than this are dropped;
        defaults to the module-level ``threh``
    :raises UnicodeDecodeError: if an input file is not valid UTF-8
    """
    if min_length is None:
        min_length = threh
    if output_dir is None:
        output_dir = os.path.join(data_original_path, 'processed')
    # Opening a file in append mode does not create missing folders.
    os.makedirs(output_dir, exist_ok=True)

    # Compiled once: the patterns are the same for every file and record.
    pattern_url = re.compile(r'<url>(.*?)</url>', re.S)
    pattern_content = re.compile(r'<content>(.*?)</content>', re.S)
    # Dots are escaped so they only match literal dots, and the label may
    # not contain '/' because it is used as a file name.
    pattern_class = re.compile(r'http://([^/]+?)\.sohu\.com/')

    for path in list_name:
        print(path+'---start---')
        # Read raw bytes and decode explicitly so line endings are kept
        # exactly as stored; the handle is closed as soon as we are done.
        with open(path, 'rb') as fp:
            text = fp.read().decode("utf8")

        urls = pattern_url.findall(text)
        contents = pattern_content.findall(text)
        if len(urls) != len(contents):
            # URLs and contents are paired by position; with unequal counts
            # the labels would be attached to the wrong texts.
            print(path+'---skipped: url/content count mismatch---')
            continue

        # Group by category so each output file is opened once per input
        # file; the relative order inside a category is preserved.
        grouped = {}
        for url, content in zip(urls, contents):
            if len(content) < min_length:
                continue
            match = pattern_class.search(url)
            if match is None:
                # Not a *.sohu.com URL: no category can be derived.
                continue
            grouped.setdefault(match.group(1), []).append(content)

        for label, items in grouped.items():
            out_path = os.path.join(output_dir, label + '.txt')
            with open(out_path, 'a', encoding='utf-8') as f:
                f.writelines(item + '\n' for item in items)
        print(path+'---success---')
   
if __name__=='__main__':
    print("----tast start----")
    # Root folder of the raw corpus. Backslashes are doubled so that "\n"
    # is not read as a newline and the trailing backslash does not escape
    # the closing quote.
    data_original_path = "D:\\software_study\\nlp_data\\SogouCS.reduced\\"
    #data_original_path = './SogouCS.reduced/'

    # Collect the paths of all corpus files (sub-folders included).
    list_name = []
    listdir_get(data_original_path, list_name)

    # Re-encoding to UTF-8 is disabled in this script.
    #code_transfer(list_name)
    processing(list_name)

    print('----task success----')

（2）转码脚本：将原始语料文件转为utf-8编码（本人在实际操作中把转码与提取分开进行）

#-*- coding:utf-8 -*-
import os
from chardet import detect

# Root folder of the raw corpus. A raw string keeps the Windows backslashes
# literal; otherwise "\n" in "\nlp_data" would be read as a newline.
data_original_path = r"D:\software_study\nlp_data\SogouCS.reduced"

# Build the list of files under the raw-corpus folder.
def listdir(path, list_name):
    """
    Walk ``path`` recursively and append every non-directory entry to
    ``list_name``.

    :param path: directory to scan
    :param list_name: output list, extended in place with joined paths
    """
    children = (os.path.join(path, name) for name in os.listdir(path))
    for child in children:
        if os.path.isdir(child):
            # Recurse into sub-folders, sharing the same output list.
            listdir(child, list_name)
        else:
            list_name.append(child)

# Collect every corpus file, then rewrite each one in place as UTF-8.
list_name = []

# Raw string: keeps the backslashes literal ("\n" must not become a newline)
# and avoids a trailing backslash that would escape the closing quote.
listdir(r'D:\software_study\nlp_data\SogouCS.reduced', list_name)
print(list_name)
for fn in list_name:
    with open(fn, 'rb+') as fp:
        content = fp.read()
        codeType = detect(content)['encoding']
        if codeType is None:
            # chardet could not guess an encoding; leave the file untouched
            # instead of failing on decode(None, ...).
            print(fn, "：无法检测编码，已跳过")
            continue
        if codeType.lower() == 'gb2312':
            # GB18030 is a superset of GB2312/GBK. Decoding with it keeps
            # characters that a strict GB2312 decode with "ignore" would
            # silently drop.
            codeType = 'gb18030'
        content = content.decode(codeType, "ignore").encode("utf8")
        fp.seek(0)
        fp.write(content)
        # Drop leftover bytes when the re-encoded data is shorter than the
        # original file contents.
        fp.truncate()
        print(fn, "：已修改为utf8编码")

edit by Strangewx

相关阅读:
（八）断路器-Hystrix
WINDOWS SERVER 2012 虚拟机忘记密码后
 IIS FTP ：在组合的密钥属性“users，roles，permissions”分别设置为“*，Read，Write”时，无法添加类型为“add”的重复集合项
 log4j 日志组件
 IDEA缓存
 com.alibaba.druid.pool.DruidDataSource
EHCache CacheManager
webservice调试(XML参数) Wizdler PostMan
jar类库加载顺序
 JAXB工具
原文地址：https://www.cnblogs.com/strangewx/p/10212572.html