# coding:utf-8
# Segment a corpus with jieba: strip the <content>...</content> wrapper from
# each line, cut it into words, drop stopwords and single-character tokens,
# and write the surviving tokens space-separated to the output file.
import jieba

STOPWORDS_PATH = '/Test/orgindata/stopwords.txt'
CORPUS_PATH = '/Test/orgindata/corpus.txt'
OUTPUT_PATH = '/Test/process2/corpus-seg.txt'


def _load_stopwords(path):
    """Return the set of stopwords read from *path*, one word per line."""
    with open(path, encoding='utf-8') as f:
        return {word.strip() for word in f}


def _segment_line(line, stopwords):
    """Segment one corpus line.

    Strips the leading '<content>' (9 chars) and trailing '</content>' plus
    newline (11 chars), cuts the payload with jieba, and returns the tokens
    that are not stopwords and are longer than one character, each followed
    by a single space. Returns '' when nothing survives the filters.
    """
    payload = line[9:-11]
    kept = [w for w in jieba.cut(payload) if w not in stopwords and len(w) > 1]
    # join is O(n); the original built the string with quadratic `+=`.
    return ''.join(w + ' ' for w in kept)


def main():
    stopwords = _load_stopwords(STOPWORDS_PATH)
    with open(CORPUS_PATH, encoding='utf-8') as src, \
         open(OUTPUT_PATH, 'w+', encoding='utf-8') as dst:
        pending = []  # segmented results accumulated since the last flush
        index = 0     # count of lines that produced a non-trivial result
        for line in src:
            # The original stopped at the first line of length <= 4
            # (e.g. a blank/terminator line); behavior preserved.
            if len(line) <= 4:
                break
            result = _segment_line(line, stopwords)
            if len(result) > 4:
                pending.append(result)
                index += 1
                # Flush every 100 kept lines to bound memory use.
                if index % 100 == 0:
                    dst.write(''.join(pending) + ' ')
                    pending = []
                    print('line ' + str(index))
        # BUGFIX: the original never wrote the tail — any batch of fewer
        # than 100 lines remaining at EOF was silently discarded.
        if pending:
            dst.write(''.join(pending) + ' ')
    print('处理完成')


if __name__ == '__main__':
    main()