阶段作业1：完整的中英文词频统计+补交上次作业

# Make-up for the previous assignment: word-frequency count of song lyrics.

cc = ('''Counting stars Lately I've been, I've been losing sleep 　　
Dreaming 'bout the things that we could be 　　
But baby I've been, I've been prayin' hard 　 　　
Said no more counting dollars 　　We'll be counting stars 　　
Yeah, we'll be counting stars 　　I see this life Like a swinging vine 　
　Swing my heart across the line 　　In my face is flashing signs 　　Seek it out and ye shall find
　　Old, but I'm not that old 　　Young, but I'm not that bold 　　And I don't think the world is sold 　
　I'm just doing what we're told 　　I, feel something so right 　　But doing the wrong thing 　　
I, feel something so wrong 　　But doing the right thing 　　I could lie, could lie, could lie 　
　everything that kills me makes me feel alive 　　Lately I've been, I've been losing sleep 　
　Dreaming 'bout the things that we could be 　　Baby I've been, I've been prayin' hard 　
　Said no more counting dollars 　　We'll be counting stars 　　Lately I've been, I've been losing sleep 　　
Dreaming 'bout the things that we could be 　　Baby I've been, I've been prayin' hard 　　Said no more counting dollars 　
　We'll be, we'll be counting stars 　　I feel the love And I feel it burn 　　Down this river every turn 　
　Hope is a four letter word 　　Make that money 　　Watch it burn 　　Old, but I'm not that old 　
　Young, but I'm not that bold 　　And I don't think the world is sold 　　I'm just doing what we're told 　
　I, feel something so wrong 　　But doing the right thing 　　I could lie, could lie, could lie 　
　Everything that drowns me makes me wanna fly 　　Lately I've been, I've been losing sleep 　
　Dreaming 'bout the things that we could be 　　Baby I've been, I've been prayin' hard
　　Said no more counting dollars 　　We'll be counting stars 　　Lately I've been, I've been losing sleep 　
　Dreaming 'bout the things that we could be 　　Baby I've been, I've been prayin' hard 　
　Said no more counting dollars 　　We'll be, we'll be counting stars 　　Take that money And watch it burn 　　Sink in the river
''')

# Replace sentence punctuation with spaces so that "been," and "been" are the
# same word. Apostrophes are kept because they are part of words ("I've").
for ch in '.,:;!?':
    cc = cc.replace(ch, ' ')

# split() without arguments splits on any whitespace run, which also covers
# the full-width (ideographic) spaces embedded in the lyrics.
ccList = cc.split()
print(len(ccList), ccList)  # number of words, followed by the words themselves

# The set holds each distinct word once.
ccSet = set(ccList)
print(ccSet)

# Count whole words in a single pass. Counting on the raw string would match
# substrings instead (e.g. "be" inside "been").
strDict = {}
for star in ccList:
    strDict[star] = strDict.get(star, 0) + 1
for key in ccSet:
    print(key, strDict[key])

# (word, count) pairs, still unsorted.
wclist = list(strDict.items())
print(wclist)

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
print(wcList)
wcList.sort(key=lambda x: x[1], reverse=True)
print(wcList)

# Print the TOP 20; slicing stays safe when there are fewer than 20 words.
for entry in wcList[:20]:
    print(entry)


# Iterating over a list

cclist = ['wqdq', 'dqd', 'Awd', 313, '小四', 'dqd']
print(cclist)

# Add one element at the end.
cclist += ['gegeheh']
print(cclist)

# Drop the element at index 2.
del cclist[2]
print(cclist)

# Visit every element in order.
for item in cclist:
    print(item)

# Iterating over a tuple

# Named `tup` rather than `tuple` so the builtin tuple type is not shadowed
# for the rest of the script.
tup = ('jtfjhrr', 'rqfw f2q', 800, 10)
print(tup[2])
for i in tup:
    print(i)

# Iterating over a dictionary

dic = {
    'fhehe': '4w6436',
    'jgdns': 7,
    '4w6436': 'First',
}

# Look up two entries by key.
for shown in ('fhehe', '4w6436'):
    print(shown + ':', dic[shown])

# Overwrite an existing key twice; only the last value remains.
dic['4w6436'] = 8
dic['4w6436'] = "对接欧文机房的维护"

# Show the updated value (printed twice).
for _ in range(2):
    print('4w6436:', dic['4w6436'])

# Walk over every key/value pair.
for key, value in dic.items():
    print(key, ':', value)

# Iterating over a set

a = {1, 2, 3, 6, 5}
print(a)

# Add two more members, showing the set after each insertion.
for extra in (4, 'uteru'):
    a.add(extra)
    print(a)

# Take 5 out of the set again.
a -= {5}
print(a)

# Visit every member.
for member in a:
    print(member)

# This assignment: English word-frequency count of a text file.

# Read the whole file, lower-cased so that counting is case-insensitive.
# The with-statement closes the file even if reading fails.
with open('ccc1015.txt', 'r', encoding='utf-8') as fo:
    strBig = fo.read().lower()
print(strBig)

# Pre-processing: the text is already lower-cased; now remove punctuation.
sep = """.,:;!?"""
for ch in sep:
    strBig = strBig.replace(ch, '')
strlist = strBig.split()
print(len(strlist), strlist)

# Distinct words minus the words that should not be ranked.
# The entries are lower-case because the text was lower-cased above.
strSet = set(strlist)
exclude = {'is', 'be', 'i', 'we', 'the', 'in'}
strSet = strSet - exclude
print(len(strSet), strSet)

# Count every kept word in a single pass over the word list.
strDict = {}
for word in strlist:
    if word in strSet:
        strDict[word] = strDict.get(word, 0) + 1
print(len(strDict), strDict)

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
print(wcList)
wcList.sort(key=lambda x: x[1], reverse=True)
print(wcList)

# Print the TOP 20; slicing stays safe when there are fewer than 20 words.
for entry in wcList[:20]:
    print(entry)




# Chinese version

# Chinese word segmentation uses the third-party jieba package.
import jieba

# Read the text file; the with-statement closes it even if reading fails.
with open('shengxu.txt', 'r', encoding='utf-8') as f:
    story = f.read()
print(story)

# Pre-processing: replace full-width punctuation with spaces.
sep = '，。：“”？！'
for ch in sep:
    story = story.replace(ch, ' ')
# Print the cleaned text once, after all replacements are done.
print(story)

# Segment the text once with jieba (precise mode) and reuse the result.
cnStr = story
tokens = list(jieba.cut(cnStr))
print(tokens)

# Keep only real words. The spaces inserted above and the line breaks in the
# file would otherwise be counted as words (jieba appears to emit whitespace
# as separate tokens).
strList = [tok for tok in tokens if tok.strip()]
print(len(strList), strList)

# Distinct words.
strSet = set(strList)
print(len(strSet), strSet)

# Count every word in a single pass over the token list.
strDict = {}
for word in strList:
    strDict[word] = strDict.get(word, 0) + 1

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
wcList.sort(key=lambda x: x[1], reverse=True)

# Print the TOP 10; slicing stays safe when there are fewer than 10 words.
for entry in wcList[:10]:
    print(entry)

相关阅读:
leetcode 300. 最长上升子序列
 JAVA基础系列：Arrays.binarySearch二分查找
 leetcode 674. 最长连续递增序列
 小红书：笔试题（棋盘最短路径，笔记本草稿栈，迷宫游戏）
VIPKID:笔试题（数组中和为0的一对数的数量，十进制转二进制中1的个数）
[******] 树问题：普通二叉树的创建与遍历
 [******] 链表问题：将单向链表按某值划分成左边小、中间相等、右边大的形式
 [******] java多线程连续打印abc
快手：笔试题（版本号比较，平方和为1，合并两个流）
京东：笔试题（合唱队找剩余的最小值，考场安排搬出的人数尽可能少）
原文地址：https://www.cnblogs.com/cc013/p/9789856.html