import requests
from bs4 import BeautifulSoup
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


def get_url(urls):
    # Build the list of cnblogs news listing pages (pages 0-99).
    for n in range(0, 100):
        url = 'https://news.cnblogs.com/n/page/' + str(n) + '/'
        urls.append(url)
    return urls


def get_info(url, content):
    # Fetch one listing page and collect the headline text from each news item.
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    news = soup.select('div[class="content"] h2')
    for nn in news:
        content.append(nn.get_text().strip().split(' '))
    return content


urls = []
single_content = []
all_content = []

# Step 1: crawl the headlines and save them to blog.txt
# (uncomment to run; the later steps read from the saved files).
# urls = get_url(urls)
# for u in urls:
#     all_content.append(get_info(u, single_content))
# with open('blog.txt', 'w', encoding='utf-8') as name:
#     for cc in all_content[0]:
#         name.write(str(cc[0]) + ' ')


def jieba_split():
    # Step 2: segment the raw text with jieba and save the space-joined result.
    with open('blog.txt', encoding='utf-8') as f:
        comment_text = f.read()
    cut_text = " ".join(jieba.cut(comment_text))
    with open('blog_split.txt', 'w', encoding='utf-8') as f:
        f.write(cut_text)

# jieba_split()


def wordcouter():
    # Step 3: count how often each segmented word appears and save the counts.
    word_lists = []
    with open('blog_split.txt', 'r', encoding='utf-8') as f:
        for ww in f.readlines():
            for word in jieba.cut(ww):
                word_lists.append(word)
    couter = []
    for w in set(word_lists):
        couter.append(w + ':' + str(word_lists.count(w)) + '次 ')
    with open('counter.txt', 'w', encoding='utf-8') as f:
        f.writelines(couter)

# wordcouter()


def word_cloud():
    # Step 4: draw the word cloud, using a picture as the shape mask.
    with open('counter.txt', 'r', encoding='utf-8') as f:
        s_words = f.read()
    words = jieba.cut(s_words, cut_all=True)
    words_split = " ".join(words)
    print(words_split)
    # scipy.misc.imread was removed in newer SciPy; matplotlib's imread works here.
    background_pic = plt.imread('hellokity.JPG')
    word_c = WordCloud(
        width=1000,
        height=1000,
        margin=2,
        background_color='white',
        mask=background_pic,
        font_path=r'C:\Windows\Fonts\STZHONGS.TTF',  # a Chinese-capable font is required
        stopwords=STOPWORDS,
        max_font_size=100,
        random_state=100
    )
    word_c.generate_from_text(words_split)
    word_c.to_file('kity.JPG')


word_cloud()
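
# --- Optional sketch (not part of the original pipeline) --------------------
# wordcouter() above calls list.count() once per unique word, which is
# quadratic on large inputs. A minimal alternative using collections.Counter,
# assuming the same blog_split.txt / counter.txt file names as the functions
# above use:
from collections import Counter


def word_counter_fast():
    # Read the already-segmented text and count tokens in one pass.
    with open('blog_split.txt', 'r', encoding='utf-8') as f:
        words = f.read().split()
    counts = Counter(words)
    # Write the counts in the same "word:N次 " format as wordcouter().
    with open('counter.txt', 'w', encoding='utf-8') as f:
        for word, n in counts.most_common():
            f.write(word + ':' + str(n) + '次 ')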