• 综合练习:词频统计


    1.英文词频统计

    代码如下:

    # English word-frequency script: reads lyric.txt, strips punctuation,
    # lower-cases the text, counts every word that is not a stop word and
    # appends the (at most) 20 most frequent words to lyricCount.txt.
    from collections import Counter

    # The context manager closes the file even if reading fails.
    with open('lyric.txt', 'r') as f:
        lyric = f.read()

    punctuation = ''',.?/:;'"'''
    # Stop words left out of the statistics. They are lower-case and written
    # without apostrophes ('dont', 'ill') because punctuation is removed from
    # the text before it is compared against this set.
    a = {'in', 'on', 'with', 'by', 'for', 'at', 'about', 'under', 'of', 'i',
         'a', 'is', 'its', 'so', 'and', 'dont', 'it', 'to', 'ill', 'the'}
    for i in punctuation:
        lyric = lyric.replace(i, '')
    result = lyric.lower().strip()
    tempwords = result.split()
    print(tempwords)
    words = list(set(tempwords) - a)

    print(words)
    print(result)

    # Count whole tokens. str.count() on the full text would also count a word
    # wherever it appears inside a longer one (e.g. 'he' inside 'the').
    occurrences = Counter(tempwords)
    count = {}
    for word in words:
        count[word] = occurrences[word]
        print('单词  ' + word + ' 的出现次数为:' + str(count[word]))

    for i in count:
        print(i)
        print(count[i])

    countList = list(count.items())
    countList.sort(key=lambda x: x[1], reverse=True)
    print(countList)

    # Slicing instead of range(20) avoids an IndexError when the text has
    # fewer than 20 distinct non-stop words.
    with open('lyricCount.txt', 'a') as f:
        for word, times in countList[:20]:
            f.write(word + ':' + str(times) + '\n')
    

      运行结果图:

    2.中文词频统计

    代码如下:

    # Chinese word-frequency script: reads sanguoyanyi.txt, segments it with
    # jieba, counts every token that is not a stop word and appends the
    # (at most) 20 most frequent tokens to zzzCount.txt.
    from collections import Counter

    import jieba


    # The context manager closes the file even if reading fails.
    with open('sanguoyanyi.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    # Register names with jieba's dictionary before segmenting the text.
    jieba.add_word('曹操')
    jieba.add_word('诸葛亮')
    jieba.add_word('孔明')
    punctuation = ''',。‘’“”:;()!?、 '''
    # Stop tokens left out of the statistics; '\n' and '\u3000' (ideographic
    # space) drop the whitespace tokens that jieba emits.
    a = {'的', '\n', '\u3000', '曰', '之', '不', '人', '军', '操', '一', '将',
         '大', '马', '来', '德', '有', '于', '下', '兵', '此',
         '玄', '公', '见', '为', '何', '中', '而', '可', '吾',
         '出', '也', '以', '与', '上', '后', '今', '其', '去',
         '日', '明', '言'}
    for i in punctuation:
        text = text.replace(i, '')
    # Segment once and reuse the result; the token list is still printed
    # twice, as before.
    tempwords = list(jieba.cut(text))
    print(tempwords)
    print(tempwords)
    words = list(set(tempwords) - a)
    print(words)

    # Count segmented tokens. str.count() on the raw text would also count a
    # token wherever it appears inside a longer word.
    occurrences = Counter(tempwords)
    count = {}
    for word in words:
        count[word] = occurrences[word]

    countList = list(count.items())
    countList.sort(key=lambda x: x[1], reverse=True)
    print(countList)

    # Written as UTF-8 to match the input file. Slicing instead of range(20)
    # avoids an IndexError when fewer than 20 distinct tokens remain.
    with open('zzzCount.txt', 'a', encoding='utf-8') as f:
        for word, times in countList[:20]:
            f.write(word + ':' + str(times) + '\n')
    

     运行结果图:

  • 相关阅读:
    音频可视化
    accunulate
    node
    js 中编码(encode)和解码(decode)的三种方法
    ES6高阶 同步和异步 回调地狱 promise async和await
    区间dp
    树形dp1
    背包dp(多重)
    树形dp
    背包dp(完全)
  • 原文地址:https://www.cnblogs.com/zzrf/p/8658484.html
Copyright © 2020-2023  润新知