• A complete course project


    1. Pick a topic you are interested in.

      Crawl the current-politics page of Sohu News and analyze what the day's news keywords are.

    2. Analyze the Sohu page

      The URL of the page is http://www.sohu.com/c/8/1460.html. We need the link of every news item on that page, so the a tags are filtered with a regular expression on their href attribute to pick out each news link.

    Grab the link of each news item:

    # Visit the first page of the Sohu News current-politics channel and pick out the link of every news item on it
    html = requests.get('http://www.sohu.com/c/8/1460.html')
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text,'html.parser')
    news_list = soup.find_all('a',href=re.compile(r'//www.sohu.com/'))
    news_urls = set()
    for url in news_list:
        news_url = url['href']
        news_urls.add(news_url)
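
      The matched hrefs are usually protocol-relative (they start with //www.sohu.com/), which is why the download loop in the next step prepends 'http:'. If some hrefs already carry a scheme, that concatenation would produce a malformed URL, so a small normalization pass (a sketch added here, not part of the original program) can reduce every link to the protocol-relative form first:

    # Sketch: keep only sohu.com links and reduce them to protocol-relative form,
    # so that prepending 'http:' later always yields a valid URL.
    normalized = set()
    for u in news_urls:
        if u.startswith('http://'):
            u = u[len('http:'):]
        if u.startswith('//www.sohu.com/'):
            normalized.add(u)
    news_urls = normalized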

    3. Visit each crawled link and download the page

      Inspecting a news page shows that its title sits in a div with the text-title class and the news body in an article tag with the article class. The downloaded text of every article is appended to the string news.

    news=""
    
    for url in news_urls:
        html = requests.get('http:' + url)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text,'html.parser')
        title=soup.find('div',class_='text-title').find('h1')
        content=soup.find('article',class_='article')
        news = news + content.get_text()
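
      Not every matched link points to a regular article page, so soup.find(...) can return None and the request itself can fail. The hardened variant below is only a sketch (the timeout value and the error handling are assumptions, not part of the original program) that skips such pages instead of crashing:

    # Hardened download loop (sketch): tolerate request errors and pages
    # that lack the expected text-title div or article tag.
    news = ""
    for url in news_urls:
        try:
            html = requests.get('http:' + url, timeout=10)
        except requests.RequestException:
            continue
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, 'html.parser')
        title = soup.find('div', class_='text-title')
        content = soup.find('article', class_='article')
        if title is None or content is None:
            continue
        news = news + content.get_text()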

    4. Text analysis and filtering of the crawled news

    # Strip the printable ASCII characters (space through '|') from the news text
    ex = [chr(x) for x in range(32,125)]
    for ch in ex:
        news = news.replace(ch,'')
    # Strip common full-width Chinese punctuation as well
    exchi = [',','。','、',':',';','?','!','(',')','《','》','【','】','"','"','…','{','}']
    for ch in exchi:
        news = news.replace(ch,'')
    
    print(news,"
    
    
    
    
    
    
    ")
    # Text analysis of the news
    
    ls = []
    words = jieba.lcut(news)
    counts = {}
    for word in words:
        if len(word) == 1:
            continue
        counts[word] = counts.get(word,0)+1
        ls.append(word)
    
    items = list(counts.items())
    items.sort(key = lambda x:x[1], reverse = True)
    for i in range(10):
        word , count = items[i]
        print ("{:<10}{:>5}".format(word,count))
    
    # Whitespace, full-width punctuation, and boilerplate such as '原标题' to drop from the token list
    ex = ['\n', '\u3000', ' ', ',', '。', '、', ':', ';', '?', '!',
          '(', ')', '《', '》', '{', '}', '原标题']
    
    # Build a filtered copy; calling ls.remove() while iterating over ls would skip elements
    ls = [ke for ke in ls if ke not in ex]
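
      Note that the top-10 ranking above is computed before the stopword list ex is applied, so junk tokens can still show up among the most frequent words. A compact alternative (a sketch, not the flow of the original program) filters first and then counts with collections.Counter:

    # Sketch: drop single characters and stopwords first, then rank the 10 most common words
    from collections import Counter

    filtered = [w for w in jieba.lcut(news) if len(w) > 1 and w not in ex]
    for word, count in Counter(filtered).most_common(10):
        print("{:<10}{:>5}".format(word, count))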

    5. Build the word cloud

    wz = open('ms.txt','w+',encoding='utf-8')
    wz.write(str(ls))
    wz.close()
    
    wz = open('ms.txt','r',encoding='utf-8').read()
    backgroud_Image = plt.imread('apple.jpg')
    wc = WordCloud( background_color = 'white',    # background color
                    mask = backgroud_Image,        # mask image that shapes the cloud
                    max_words = 2000,              # maximum number of words displayed
                    stopwords = STOPWORDS,         # stopword set
                    font_path = 'C:/Users/Windows/fonts/msyh.ttf', # a Chinese font is required, otherwise Chinese words will not render
                    max_font_size = 200,           # maximum font size
                    random_state = 30,             # number of random states, i.e. how many color schemes
                    )
    
    wc.generate(wz)
    image_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func = image_colors)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
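
      To keep the rendered image on disk as well as displaying it, the WordCloud object can be written out with to_file (the file name below is just an example):

    # Optionally save the word cloud to an image file
    wc.to_file('wordcloud.png')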

    6. Output

      The result of the jieba text analysis.

      The generated word cloud.

    7. Full program source code

    import re
    import requests
    from bs4 import BeautifulSoup
    import jieba
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
    
    # Visit the first page of the Sohu News current-politics channel and pick out the link of every news item on it
    html = requests.get('http://www.sohu.com/c/8/1460.html')
    
    html.encoding = 'utf-8'
    
    soup = BeautifulSoup(html.text,'html.parser')
    news_list = soup.find_all('a',href=re.compile(r'//www.sohu.com/'))
    news_urls = set()
    for url in news_list:
        news_url = url['href']
        news_urls.add(news_url)
    
    # Visit each crawled news page in turn and download its content
    news=""
    
    for url in news_urls:
        html = requests.get('http:' + url)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text,'html.parser')
        title=soup.find('div',class_='text-title').find('h1')
        content=soup.find('article',class_='article')
        news = news + content.get_text()
    
    # Strip the printable ASCII characters (space through '|') from the news text
    ex = [chr(x) for x in range(32,125)]
    for ch in ex:
        news = news.replace(ch,'')
    # Strip common full-width Chinese punctuation as well
    exchi = [',','。','、',':',';','?','!','(',')','《','》','【','】','"','"','…','{','}']
    for ch in exchi:
        news = news.replace(ch,'')
    
    # Text analysis of the news
    
    ls = []
    words = jieba.lcut(news)
    counts = {}
    for word in words:
        if len(word) == 1:
            continue
        counts[word] = counts.get(word,0)+1
        ls.append(word)
    
    items = list(counts.items())
    items.sort(key = lambda x:x[1], reverse = True)
    for i in range(10):
        word , count = items[i]
        print ("{:<10}{:>5}".format(word,count))
    
    # Whitespace, full-width punctuation, and boilerplate such as '原标题' to drop from the token list
    ex = ['\n', '\u3000', ' ', ',', '。', '、', ':', ';', '?', '!',
          '(', ')', '《', '》', '{', '}', '原标题']
    
    # Build a filtered copy; calling ls.remove() while iterating over ls would skip elements
    ls = [ke for ke in ls if ke not in ex]
    
    
    # Build the word cloud
    wz = open('ms.txt','w+',encoding='utf-8')
    wz.write(str(ls))
    wz.close()
    
    wz = open('ms.txt','r',encoding='utf-8').read()
    
    backgroud_Image = plt.imread('apple.jpg')
    wc = WordCloud( background_color = 'white',    # background color
                    mask = backgroud_Image,        # mask image that shapes the cloud
                    max_words = 2000,              # maximum number of words displayed
                    stopwords = STOPWORDS,         # stopword set
                    font_path = 'C:/Users/Windows/fonts/msyh.ttf', # a Chinese font is required, otherwise Chinese words will not render
                    max_font_size = 200,           # maximum font size
                    random_state = 30,             # number of random states, i.e. how many color schemes
                    )
    
    wc.generate(wz)
    image_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func = image_colors)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()