• 综合练习:词频统计


    # English word-frequency count.
    # Reads test.txt, strips punctuation, lower-cases and splits the text into
    # words, drops the stop words in `exclude`, prints the 20 most frequent
    # (word, count) pairs and appends the 25 most frequent to newscount.txt.
    from collections import Counter

    # `with` guarantees the handle is closed even if read() raises.
    with open('test.txt', 'r') as f:
        news = f.read()

    sep = '''.,'?!:"'''
    exclude = {'the', 'and', 'to', 'a', 'of', 'was', 'on', 'with', 'i', 's',
               'is', 'were', 'that', 'back', 'at', 'little', 'have'}

    # Turn every punctuation mark into a space in one pass over the text.
    news = news.translate(str.maketrans(sep, ' ' * len(sep)))

    wordList = news.lower().split()

    # Single pass over the words instead of one list.count() call per
    # distinct word (which was quadratic in the size of the text).
    wordDict = Counter(word for word in wordList if word not in exclude)

    # (word, count) pairs ordered from most to least frequent.
    dictList = wordDict.most_common()

    # Slicing tolerates texts with fewer than 20 / 25 distinct words, where
    # indexing by range(20) / range(25) raised IndexError.
    for pair in dictList[:20]:
        print(pair)

    # Append mode is kept from the original: repeated runs accumulate results.
    with open('newscount.txt', 'a') as f:
        for word, count in dictList[:25]:
            f.write(word + ' ' + str(count) + '\n')
    

    运行结果:

    中文统计

    import jieba
    
    # Chinese word-frequency count.
    # Reads text.txt (UTF-8), replaces punctuation with spaces, segments the
    # text with jieba, drops the stop words in `exclude`, and appends the 20
    # most frequent "word:count" lines to new.txt.
    from collections import Counter

    with open('text.txt', 'r', encoding='utf-8') as f:
        news = f.read()

    sep = ''',。‘’“”:;()!?、《》 '''
    exclude = {'我', '在', '不', '一', '了', '那', '是', '来', '他', '个', '行', '你', '的',
               '者', '有', '\n', '-', '出', '这', '时', '没', '她', '到', '上', '们', '会', '着', '说', '要',
               '为', '过', '看', '得', '里', '克', '去', '想', '好', '天', '小', '后', '地', '么', '都',
               '还', '以', '对', '能', '大', '也', '很', '而', '然', '下', '但', '吕', '把', '开', '从',
               '让', '就', '一个', '可', '点', '跟', '样', '向', '事', '起', '中', '面'}

    # Turn every separator into a space in one pass over the text.
    news = news.translate(str.maketrans(sep, ' ' * len(sep)))

    wordList = list(jieba.cut(news))

    # Count the segmented tokens themselves. The previous news.count(word)
    # counted substring occurrences in the raw text, so a short word was
    # inflated by every longer word that contains it.
    # Whitespace-only tokens are skipped: the separators above were replaced
    # by spaces, which the tokenizer may hand back as tokens of their own
    # (NOTE(review): confirm against the jieba version in use).
    wordDict = Counter(
        token for token in wordList
        if token.strip() and token not in exclude
    )

    # (word, count) pairs ordered from most to least frequent.
    dictList = wordDict.most_common()

    # Slicing tolerates texts with fewer than 20 distinct words, where
    # indexing by range(20) raised IndexError.
    # Append mode is kept from the original: repeated runs accumulate results.
    with open('new.txt', 'a', encoding="utf-8") as f:
        for word, count in dictList[:20]:
            f.write(word + ':' + str(count) + '\n')
    

     结果:

  • 相关阅读:
    [leedcode 104] Maximum Depth of Binary Tree
    [leedcode 103] Binary Tree Zigzag Level Order Traversal
    [leedcode 102] Binary Tree Level Order Traversal
    [leedcode 101] Symmetric Tree
    [leedcode 100] Same Tree
    [leedcode 99] Recover Binary Search Tree
    深入理解java虚拟机---内存分配策略(十三)
    jmeter4.0 源码编译 二次开发
    jmeter源码导入eclipse并执行
    深入理解java虚拟机---垃圾回收(十一)
  • 原文地址:https://www.cnblogs.com/cgq520/p/8658573.html
Copyright © 2020-2023  润新知