• 爬虫大作业


    1.选一个自己感兴趣的主题。

    2.用python 编写爬虫程序,从网络上爬取相关主题的数据。

    3.对爬取的数据进行文本分析,生成词云。

    4.对文本分析结果进行解释说明。

    5.写一篇完整的博客,描述上述实现过程、遇到的问题及解决办法、数据分析思想及结论。

    6.最后提交爬取的全部数据、爬虫及数据分析源代码。

    # -*- coding: UTF-8 -*-
    
    import requests
    import json
    import re
    from bs4 import BeautifulSoup
    import jieba
    from PIL import Image
    import numpy as np
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud, ImageColorGenerator
    
    
    # 获取评论数
    def getCommentsCounts(newsurl):
        """Return the total number of comments for a news article.

        The article id is taken from the ``doc-i<id>.shtml`` part of
        ``newsurl`` and substituted into the module-level ``commentURL``
        template, whose JSON response carries the count under
        ``result -> count -> total``.

        Args:
            newsurl: URL of the article page.

        Returns:
            The comment total reported by the comment API, or ``0`` when the
            URL contains no article id or the response has no count.
        """
        # Escape the dot and match lazily so only the id itself is captured.
        match = re.search(r'doc-i(.+?)\.shtml', newsurl)
        if match is None:
            # Not an article URL: there is nothing to look up.
            return 0
        newsid = match.group(1)
        # A timeout keeps one stalled request from hanging the whole crawl.
        comment = requests.get(commentURL.format(newsid), timeout=10)
        jd = json.loads(comment.text)
        # The count object may be absent; treat that as zero comments.
        return jd.get('result', {}).get('count', {}).get('total', 0)
    
    
    def getNewsDetail(newsurl):
        """Download one news article page and return its body text.

        The page is parsed with BeautifulSoup; title, source, date, editor and
        comment count are collected into a local dict, but only the article
        body is returned, because the caller splits the returned string.

        Args:
            newsurl: URL of the article page.

        Returns:
            The stripped text of the first ``.article`` element, or ``''``
            when the page has no such element.
        """
        # A timeout keeps one stalled request from hanging the whole crawl.
        res = requests.get(newsurl, timeout=10)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')

        def first_text(selector):
            # Text of the first match, or '' instead of an IndexError when
            # the page layout lacks that element.
            nodes = soup.select(selector)
            return nodes[0].text.strip() if nodes else ''

        result = {}
        # Title
        result['title'] = first_text('.main-title')
        # Source
        result['newssources'] = first_text('.source')
        # Publication time
        result['timesource'] = first_text('.date')
        # Editor: remove only the leading label. str.strip() takes a set of
        # characters, and indexing [-1] kept just the last character.
        result['editor'] = re.sub(
            r'^责任编辑\s*[::]\s*', '', first_text('.show_author'))
        # Comment count for THIS article, not for the module-level sample url.
        result['comments'] = getCommentsCounts(newsurl)
        # Body text
        result['contents'] = first_text('.article')
        return result['contents']
    
    
    # 保存为 txt 文件
    def writeNewsContent(content):
        """Append ``content`` to ``news.txt`` in the working directory.

        Args:
            content: Text to append; it is written as UTF-8 with no separator
                or newline added.
        """
        # The context manager closes the file even if the write fails.
        with open('news.txt', 'a', encoding='utf-8') as f:
            f.write(content)
    
    
    def parseListLinks(url):
        """Fetch one page of the news list and collect every article's words.

        The list API answers with JSONP (``newsloadercallback(...);``). The
        JSON payload is unwrapped, each entry's ``url`` is downloaded with
        ``getNewsDetail`` and its text is split on whitespace. The collected
        lists are also appended to ``news.txt`` through ``writeNewsContent``.

        Args:
            url: URL of one page of the list API.

        Returns:
            A list with one list of whitespace-separated tokens per article.
        """
        # A timeout keeps one stalled request from hanging the whole crawl.
        res = requests.get(url, timeout=10)
        payload = res.text.strip()
        if not payload.startswith(('{', '[')):
            # Take the text between the callback's parentheses. lstrip/rstrip
            # remove character SETS, not a prefix/suffix, so they only worked
            # while the JSON began with '{' and nothing followed the ';'.
            payload = payload[payload.find('(') + 1:payload.rfind(')')]
        jd = json.loads(payload)
        newsdetails = [
            getNewsDetail(news['url']).split()
            for news in jd['result']['data']
        ]
        writeNewsContent(str(newsdetails))
        return newsdetails
    
    
    # Comment-count API template; ``{}`` receives the article id.
    # Adjacent literals are joined so the long URL stays one string with no
    # embedded line breaks.
    commentURL = (
        'http://comment5.news.sina.com.cn/page/info?version=1'
        '&format=json&channel=gn&newsid=comos-{}&group=undefined&'
        'compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3'
    )
    # Sample article URL.
    url = 'http://finance.sina.com.cn/chanjing/gsnews/2018-04-29/doc-ifzvpatq7964658.shtml'
    # News-list API template; ``{}`` receives the page number.
    listURL = (
        'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw'
        '&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1'
        '&show_all=1&show_num=22&tag=1&format=json&page={}&'
        'callback=newsloadercallback&_=1524705663198'
    )
    news_total = []
    # range(1, 2) crawls only the first list page; widen it for more pages.
    for i in range(1, 2):
        newssurl = listURL.format(i)
        newsary = parseListLinks(newssurl)
        news_total.extend(newsary)
    print(len(news_total))
    
    
    from collections import Counter

    # Load the text to analyse.
    # NOTE(review): the crawler above writes 'news.txt' but this reads
    # 'content.txt' — confirm which file is intended.
    with open('content.txt', 'r', encoding='utf-8') as f:
        news = f.read()

    sep = ''',。‘’“”:;()!?、《》[] '''
    # NOTE(review): the original entries here were all empty strings and look
    # lost in extraction — put the real stop words in this set.
    exclude = {''}

    # Turn every separator character into a space in a single pass.
    news = news.translate(str.maketrans(sep, ' ' * len(sep)))
    wordList = list(jieba.cut(news))

    # Count tokens rather than substrings: news.count(word) also counted a
    # word inside longer words. Whitespace-only tokens are dropped too.
    wordDict = {
        word: count
        for word, count in Counter(wordList).items()
        if word.strip() and word not in exclude
    }

    dictList = sorted(wordDict.items(), key=lambda x: x[1], reverse=True)
    cy = {}
    # NOTE(review): this appends the frequency table to the same 'news.txt'
    # the crawler writes its raw data to — confirm that is intended.
    with open('news.txt', 'a', encoding="utf-8") as f:
        # Slicing tolerates texts with fewer than 1000 distinct words.
        for word, count in dictList[:1000]:
            print((word, count))
            f.write(word + ':' + str(count) + '\n')
            cy[word] = count

    # Backslashes restored on the assumption the extraction dropped them —
    # TODO confirm the font file name on the target machine.
    font = r'C:\Windows\Fonts\wb.ttf'
    image = Image.open('./wordcloud.jpg')
    graph = np.array(image)
    wc = WordCloud(font_path=font, background_color='White', max_words=50, mask=graph)
    wc.generate_from_frequencies(cy)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()

    生成的词云:

  • 相关阅读:
    withDefaultPasswordEncoder() 过时弃用问题
    @Value不能给静态变量直接赋值问题
    java编程思想之垃圾收集
    阅读java编程思想之一切都是对象
    阅读java编程思想的总结(一)
    Idea连接服务器docker并部署代码到docker实现一键启动
    后端设置Cookie前端跨域获取丢失问题(基于springboot实现)
    win10安装docker并结合Idea2018.1部署springboot项目
    Idea用maven给springboot打jar包
    css纯数字或字母换行
  • 原文地址:https://www.cnblogs.com/xuyizhu/p/8974371.html
Copyright © 2020-2023  润新知