• Python Big Data: Chinese word segmentation and word-frequency statistics with jieba


    # -*- coding: UTF-8 -*-
    import pandas as pd
    import jieba
    import jieba.analyse

    # Widen pandas' column display so long text is not truncated
    pd.set_option('display.max_colwidth', 500)

    # Load the data
    rows = pd.read_csv('datas1.csv', header=0, encoding='utf-8', dtype=str)
    # Load the stop-word list
    jieba.analyse.set_stop_words('stoped.txt')

    # Collect all extracted words, for global word-frequency statistics
    segments = []
    # Collect each row's words, for later association analysis
    results = []

    for row in rows.index:
        content = str(rows.loc[row, 'content'])
        # Segmentation options:
        # words = jieba.cut(content)
        # TF-IDF keyword extraction; topK sets the number of keywords, punctuation and digits are not filtered out
        # words = jieba.analyse.extract_tags(content, topK=20)
        # TextRank keyword extraction, restricted to the listed parts of speech
        words = jieba.analyse.textrank(content, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
        splitedStr = ''
        for word in words:
            # Record the word globally
            segments.append({'word': word, 'count': 1})
            splitedStr += word + ' '
        # Record the per-row result
        results.append({'text': content, 'words': splitedStr})

    # Turn the word list into a DataFrame
    dfSg = pd.DataFrame(segments)

    # Word-frequency statistics
    dfWord = dfSg.groupby('word')['count'].sum()

    # Export to CSV
    dfWord.to_csv('keywords.csv', encoding='utf-8')

    dfRet = pd.DataFrame(results)
    dfRet.to_csv('result.csv', encoding='utf-8')
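
The loop above keeps two commented-out alternatives to TextRank. To see how the three strategies differ on the same input, here is a minimal standalone sketch; the sample sentence and the topK value of 5 are purely illustrative and not taken from the original data:

    # -*- coding: UTF-8 -*-
    import jieba
    import jieba.analyse

    # Illustrative sample sentence (not from datas1.csv)
    content = '我来到北京清华大学,在实验室学习自然语言处理和中文分词技术'

    # 1) Plain segmentation: returns every token, including stop words and punctuation
    print(list(jieba.cut(content)))

    # 2) TF-IDF keyword extraction: top-weighted terms, no part-of-speech filter by default
    print(jieba.analyse.extract_tags(content, topK=5))

    # 3) TextRank keyword extraction restricted to nouns/verbs, as in the script above
    print(jieba.analyse.textrank(content, topK=5, withWeight=False,
                                 allowPOS=('ns', 'n', 'vn', 'v')))

Running all three on a few sample rows is a quick way to decide whether plain segmentation, TF-IDF, or TextRank best matches the vocabulary you want to see in keywords.csv.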
• Original article: https://www.cnblogs.com/blackice/p/8614430.html