import os
import re
import jieba
import requests
# Directory that per-article file paths are built under.  exist_ok=True
# replaces the separate os.path.exists() check, which could race with
# another process creating the directory between the check and the mkdir.
os.makedirs('网易新闻', exist_ok=True)

count = 0  # number of articles scraped so far
str_ = ''  # text of every scraped article, concatenated
# Scrape every article linked from each section index page and append its
# text to the running corpus ``str_``; ``count`` tracks how many succeeded.

# Compiled once up front: each pattern is reused for every downloaded page.
link_re = re.compile('href="(https://sports.163.com/.*?)"')
title_re = re.compile('<h1>(.*?)</h1>', re.S)
body_re = re.compile(
    '<div class="post_text" id="endText" style="border-top:1px solid #ddd;">(.*?责任编辑:.*?)</span>',
    re.S)
tag_re = re.compile('<.*?>')
# Punctuation and whitespace stripped from titles before they are used in a
# file name.  The final alternative is ``\s`` (whitespace); a bare ``s``
# would delete every letter "s" from the title.
dirty_re = re.compile(r'[!"#$%&()*+,-./:;<=>?@[\]^_‘{|}~,…]|\s')
# Seconds to wait on any single HTTP request, so one stalled server cannot
# hang the whole run.
request_timeout = 10

for section in ['nba']:
    # Collect every distinct article URL on the section index page.
    response = requests.get(f'https://sports.163.com/{section}/',
                            timeout=request_timeout)
    url_res = set(link_re.findall(response.text))

    # Handle one article URL at a time.
    for url in url_res:
        try:
            url_data = requests.get(url, timeout=request_timeout).text
        except requests.RequestException:
            # Best effort: a page that cannot be downloaded is skipped,
            # the same way a page that cannot be parsed is skipped below.
            continue

        title_match = title_re.search(url_data)
        body_match = body_re.search(url_data)
        if title_match is None or body_match is None:
            # Not an article page (no headline or no article body): skip it.
            continue

        # Drop the HTML tags from the article body, keeping only its text.
        news_res = tag_re.sub('', body_match.group(1))
        # Remove all the unwanted characters from the title.
        title = dirty_re.sub('', title_match.group(1))
        # Path the article would be stored at.
        # NOTE: writing the article to title_path is currently disabled;
        # only the in-memory corpus is built.
        title_path = os.path.join('网易新闻', f'{title}.txt')

        count += 1
        str_ += news_res
        print(f'完成{count}篇, {title} done...')
# Split the collected article text into tokens with jieba.
res = jieba.lcut(str_)

# Tally how often each token occurs; tokens of length one are ignored.
dic = {}
for token in res:
    if len(token) != 1:
        dic[token] = dic.get(token, 0) + 1

# (token, count) pairs in first-seen order.
dic_list = list(dic.items())
def func(i):
    """Return the second element of *i*.

    Serves as the sort key for ``(word, count)`` pairs, so that sorting
    orders them by count.
    """
    second = i[1]
    return second
# Order the pairs from most to least frequent, in place.  This is a stable
# ascending sort followed by a flip, so among equal counts the word seen
# later comes first (reverse=True would order ties the other way).
dic_list[:] = sorted(dic_list, key=func)[::-1]

# Print the 20 most frequent pairs, then build a comma-terminated string of
# their words.
top_entries = dic_list[:20]
for entry in top_entries:
    print(entry)
new_str = ''.join(f'{entry[0]},' for entry in top_entries)
import wordcloud

# Render the most frequent words as a word-cloud image.
# NOTE(review): the font path below has no path separators or file
# extension, so it looks like it lost its backslashes somewhere; confirm
# the real location of the font file before relying on it.
cloud = wordcloud.WordCloud(font_path=r'C:WindowsFonts等线Deng')
# generate() returns the WordCloud itself, so the image can be saved in the
# same expression.
cloud.generate(new_str).to_file('网易新闻.png')