• 阶段作业1:完整的中英文词频统计+补交上次作业


    # Make-up assignment: word-frequency statistics for an English lyric.

    from collections import Counter

    cc = ('''Counting stars Lately I've been, I've been losing sleep   
    Dreaming 'bout the things that we could be   
    But baby I've been, I've been prayin' hard     
    Said no more counting dollars   We'll be counting stars   
    Yeah, we'll be counting stars   I see this life Like a swinging vine  
     Swing my heart across the line   In my face is flashing signs   Seek it out and ye shall find
      Old, but I'm not that old   Young, but I'm not that bold   And I don't think the world is sold  
     I'm just doing what we're told   I, feel something so right   But doing the wrong thing   
    I, feel something so wrong   But doing the right thing   I could lie, could lie, could lie  
     everything that kills me makes me feel alive   Lately I've been, I've been losing sleep  
     Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard  
     Said no more counting dollars   We'll be counting stars   Lately I've been, I've been losing sleep   
    Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard   Said no more counting dollars  
     We'll be, we'll be counting stars   I feel the love And I feel it burn   Down this river every turn  
     Hope is a four letter word   Make that money   Watch it burn   Old, but I'm not that old  
     Young, but I'm not that bold   And I don't think the world is sold   I'm just doing what we're told  
     I, feel something so wrong   But doing the right thing   I could lie, could lie, could lie  
     Everything that drowns me makes me wanna fly   Lately I've been, I've been losing sleep  
     Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard
      Said no more counting dollars   We'll be counting stars   Lately I've been, I've been losing sleep  
     Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard  
     Said no more counting dollars   We'll be, we'll be counting stars   Take that money And watch it burn   Sink in the river
    ''')

    # Normalise before splitting: fold case so "Old" and "old" are one word,
    # and turn sentence punctuation into spaces so "been," equals "been".
    # Apostrophes are left alone so contractions such as "I've" stay whole.
    punctuation = '.,:;!?'
    cc = cc.lower().translate(str.maketrans(punctuation, ' ' * len(punctuation)))
    ccList = cc.split()
    print(len(ccList), ccList)  # total number of words, then the words

    ccSet = set(ccList)  # the distinct words
    print(ccSet)

    # Count whole tokens taken from the word list. Counting with str.count on
    # the text itself would also match substrings ("be" inside "been").
    strDict = dict(Counter(ccList))
    for key, count in strDict.items():
        print(key, count)

    # Sort by frequency, most frequent first. sorted() is stable, so words with
    # equal counts stay in order of first appearance.
    wcList = sorted(strDict.items(), key=lambda x: x[1], reverse=True)
    print(wcList)

    # Print the top 20; a slice cannot run past the end of a short list.
    for entry in wcList[:20]:
        print(entry)
    
    
    # List traversal: build a list, append one item, delete the item at
    # index 2, and print the list after each step and then element by element.

    cclist = ['wqdq', 'dqd', 'Awd', 313, '小四', 'dqd']
    print(cclist)

    cclist += ['gegeheh']
    print(cclist)

    del cclist[2]  # drops 'Awd'
    print(cclist)

    for element in cclist:
        print(element)
    
    # Tuple traversal: index one element, then print every element.

    # Bound to `tup`, not `tuple`, so the built-in tuple type stays usable
    # in the rest of the script.
    tup = ('jtfjhrr', 'rqfw f2q', 800, 10)
    print(tup[2])
    for i in tup:
        print(i)
    
    # Dictionary traversal: look up two keys, overwrite one value twice,
    # print it twice, then print every key/value pair.

    dic = {'fhehe': '4w6436', 'jgdns': 7, '4w6436': 'First'}

    for name in ('fhehe', '4w6436'):
        print(name + ':', dic[name])

    # The first assignment is immediately replaced by the second one.
    dic['4w6436'] = 8
    dic['4w6436'] = "对接欧文机房的维护"

    for _ in range(2):
        print('4w6436:', dic['4w6436'])

    for key, value in dic.items():
        print(key, ':', value)
    
    # Set traversal: create a set, add two members, remove one, and print
    # the set after each change and then member by member.

    a = {1, 2, 3, 6, 5}
    print(a)

    for extra in (4, 'uteru'):
        a.add(extra)
        print(a)

    a.remove(5)
    print(a)

    for member in a:
        print(member)
    

      

    # Current assignment: English word frequencies read from a text file.

    from collections import Counter

    # Read the whole file lower-cased; `with` closes it even if read() fails.
    with open('ccc1015.txt', 'r', encoding='utf-8') as fo:
        strBig = fo.read().lower()
    print(strBig)

    # Pre-processing: each punctuation mark becomes a space, not an empty
    # string, so words joined only by punctuation ("end.start") stay separate.
    sep = """.,:;!?"""
    strBig = strBig.translate(str.maketrans(sep, ' ' * len(sep)))
    strlist = strBig.split()
    print(len(strlist), strlist)

    # Stop words are written in lower case because the text was lower-cased
    # above; an upper-case entry could never match.
    exclude = {'is', 'be', 'i', 'we', 'the', 'in'}
    strSet = set(strlist) - exclude
    print(len(strSet), strSet)

    # One pass over the word list; stop words are dropped from the result.
    strDict = {word: count
               for word, count in Counter(strlist).items()
               if word not in exclude}
    print(len(strDict), strDict)

    # Sort by frequency, most frequent first.
    wcList = sorted(strDict.items(), key=lambda x: x[1], reverse=True)
    print(wcList)

    # Print the top 20; slicing copes with texts that have fewer distinct words.
    for entry in wcList[:20]:
        print(entry)
    
    
    
    
    # Chinese version: word frequencies for a Chinese text, segmented with jieba.

    from collections import Counter

    # Read the text file; `with` closes it even if read() fails.
    with open('shengxu.txt', 'r', encoding='utf-8') as f:
        story = f.read()
    print(story)

    # Pre-processing: replace each punctuation mark with a space.
    sep = ',。:“”?!'
    for ch in sep:
        story = story.replace(ch, ' ')
    # Printed once, after all replacements, instead of once per separator.
    print(story)

    # Chinese word segmentation: jieba
    import jieba
    cnStr = story
    # Segment once with jieba.cut's default mode and keep the tokens.
    # Whitespace-only tokens (the spaces inserted above, line breaks) are
    # dropped so they are not counted as words.
    strList = [word for word in jieba.cut(cnStr) if word.strip()]
    print(len(strList), strList)

    # Distinct words
    strSet = set(strList)
    print(len(strSet), strSet)

    # Word -> occurrence count, built in a single pass over the tokens.
    strDict = dict(Counter(strList))

    # Sort by frequency, most frequent first.
    wcList = sorted(strDict.items(), key=lambda x: x[1], reverse=True)

    # Print the top 10; slicing copes with texts that have fewer distinct words.
    for entry in wcList[:10]:
        print(entry)
    

      

      

     

  • 相关阅读:
    leetcode 300. 最长上升子序列
    JAVA基础系列:Arrays.binarySearch二分查找
    leetcode 674. 最长连续递增序列
    小红书:笔试题(棋盘最短路径,笔记本草稿栈,迷宫游戏)
    VIPKID:笔试题(数组中和为0的一对数的数量,十进制转二进制中1的个数)
    [******] 树问题:普通二叉树的创建与遍历
    [******] 链表问题:将单向链表按某值划分成左边小、中间相等、右边大的形式
    [******] java多线程连续打印abc
    快手:笔试题(版本号比较,平方和为1,合并两个流)
    京东:笔试题(合唱队找剩余的最小值,考场安排搬出的人数尽可能少)
  • 原文地址:https://www.cnblogs.com/cc013/p/9789856.html
Copyright © 2020-2023  润新知