• 统计文件夹下面的文本文件中频率最高的单词


    通过该练习,需要熟悉 lambda 作为排序 key 对字典按值排序的用法,可参考这篇内容:http://www.cnblogs.com/kaituorensheng/archive/2012/08/07/2627386.html

     1 # coding:utf-8
     2 import re
     3 import os
     4 
     5 
     def get_myfiles(path):
         """Recursively collect the paths of all ``.txt`` files under *path*.

         :param path: directory to search.
         :return: list of paths, each formed by joining *path* (or one of its
             sub-directories) with the name of a regular file ending in
             ``.txt``. The order follows ``os.listdir``.
         """
         files = []
         for name in os.listdir(path):
             fppath = os.path.join(path, name)
             # Test the suffix of the entry name itself.  Splitting the whole
             # path on '.' and reading index 2 only worked for paths shaped
             # like './a.txt' and raised IndexError for files such as
             # './README' or 'data/notes'.
             if os.path.isfile(fppath) and name.endswith('.txt'):
                 files.append(fppath)
             elif os.path.isdir(fppath):
                 files += get_myfiles(fppath)
         return files
    19 
    20 
    def get_word(files):
        """Count word occurrences across *files* and rank them by frequency.

        A word is a maximal run of ASCII letters and digits; matching is
        case-sensitive.

        :param files: iterable of paths of the files to read.
        :return: list of ``(word, count)`` tuples sorted by count, highest
            first.
        """
        # 'A-Z', not 'A-z': the latter range also covers '[', '\', ']', '^',
        # '_' and '`', which would glue e.g. 'foo_bar' into a single word.
        word_pattern = re.compile(r'[a-zA-Z0-9]+')
        worddict = {}
        for filename in files:
            # 'with' closes the file even if reading raises.
            with open(filename, 'rb') as f:
                raw = f.read()
            # Decode once so the str pattern works on Python 2 and 3 alike;
            # undecodable bytes become U+FFFD, which never matches the
            # pattern and therefore acts as a word separator.
            text = raw.decode('utf-8', 'replace')
            for word in word_pattern.findall(text):
                worddict[word] = worddict.get(word, 0) + 1
        return sorted(worddict.items(), key=lambda e: e[1], reverse=True)
    33 
    if __name__ == '__main__':
        files = get_myfiles('.')
        wordsort = get_word(files)
        # wordsort is ordered by count, highest first, so every word tied for
        # the top count sits in one run at the front of the list.
        # Guard against an empty result (no .txt files, or no words in them):
        # indexing wordsort[0] would otherwise raise IndexError.
        if wordsort:
            top_count = wordsort[0][1]
            for entry in wordsort:
                if entry[1] != top_count:
                    break
                # A parenthesised single argument prints the same on
                # Python 2 and Python 3.
                print(entry)

     下面通过定义一个类来实现单个文件的词频统计(可取出现次数最多的前 n 个单词)

    import re
    import io
    
    
    class Counter:
        """Case-insensitive word-frequency table built from one text file."""

        def __init__(self, path):
            """Read the file at *path* and count its words.

            Words are lower-cased before counting, so 'Hello' and 'hello'
            share one entry in ``self.mapping``.

            :param path: path of a UTF-8 encoded text file.
            """
            self.mapping = {}
            with io.open(path, 'r', encoding='utf-8') as f:
                content = f.read()
            # The backslash matters: a bare 'w+' only matches runs of the
            # letter "w", not whole words.
            for match in re.findall(r'\w+', content):
                word = match.lower()
                self.mapping[word] = self.mapping.get(word, 0) + 1

        def most_num(self, n):
            """Return the *n* most frequent words.

            :param n: number of entries wanted; must be greater than 0.
            :return: list of at most *n* ``(word, count)`` tuples, highest
                count first.
            :raises AssertionError: if *n* is not greater than 0.
            """
            assert n > 0, "n should be large than 0"
            return sorted(self.mapping.items(), key=lambda s: s[1], reverse=True)[:n]
    
    if __name__ == '__main__':
        # Show the six most frequent words of yes.txt, one (word, count)
        # pair per line.
        top_items = Counter('yes.txt').most_num(6)
        for entry in top_items:
            print(entry)
  • 相关阅读:
    Redis基础-基本数据类型
    C#特性
    C#反射
    Json序列化
    动态添加文本框并获取文本框的值
    iframe中镶嵌html页,并获取html页中的方法
    获取url中的参数
    发送邮件
    数据导入Excel表格
    处理xml模块、configparser模块、hashlib模块、subprocess模块
  • 原文地址:https://www.cnblogs.com/milian0711/p/7728915.html
Copyright © 2020-2023  润新知