• Fetch all campus news


    1. Extract all the news items from a single list page, wrapped up as a function.

    2. Get the total number of news articles and work out the total number of list pages (a quick sketch of this calculation follows the list).

    3. Fetch the full details of every news item on every list page.
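
    The page count in step 2 is just a ceiling division: one page per 10 articles, plus one extra page for any remainder. A minimal sketch of the same arithmetic that the script below expresses with if/else (total_articles is a hypothetical example value):

    import math

    total_articles = 567   # hypothetical example value
    PAGE_SIZE = 10         # the site lists 10 news items per page

    page_count = math.ceil(total_articles / PAGE_SIZE)
    print(page_count)      # 57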

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import locale
    import re
    
    
    def getClickCount(newsUrl):
        # The digits after the underscore identify the article; keep the last four as the id.
        newsid = re.search(r"_(.*)\.html", newsUrl).group(1)[-4:]
        clicktimesurl = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(newsid)
        # The API returns a JavaScript snippet ending in .html('<count>');
        clicktimes = int(requests.get(clicktimesurl).text.split(".html(")[-1].lstrip("'").rstrip("');"))
        return clicktimes
    
    
    def getNewsDetail(newsUrl):
        resdet = requests.get(newsUrl)
        resdet.encoding = 'utf-8'
        soupdet = BeautifulSoup(resdet.text, 'html.parser')
        contentdetail = soupdet.select('#content')[0].text
        showinfo = soupdet.select('.show-info')[0].text
        # Strip the "发布时间:" prefix characters and keep the 19-character timestamp.
        date = showinfo.lstrip("发布时间:")[:19]
        if showinfo.find('作者') > 0:
            author = re.search(r'作者:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
        else:
            author = 'none'
        if showinfo.find('审核') > 0:
            checker = re.search(r'审核:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
        else:
            checker = 'none'
        if showinfo.find('来源') > 0:
            source = re.search(r'来源:(.*?)\s*(?:摄影|点击|$)', showinfo).group(1)
        else:
            source = 'none'
        if showinfo.find('摄影') > 0:
            photographer = re.search(r'摄影:(.*?)\s*点击', showinfo).group(1)
        else:
            photographer = 'none'
        clicktimes = getClickCount(newsUrl)
        dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        print("发表时间:{0}  作者:{1}  审核:{2}  来源:{3}  摄像:{4}  点击次数:{5} 次".format(dateTime, author, checker, source, photographer,
                                                                            clicktimes))
        print(contentdetail)
    
    
    def getListDetail(ListPageUrl):
        resl = requests.get(ListPageUrl)
        resl.encoding = 'utf-8'
        soupl = BeautifulSoup(resl.text, 'html.parser')
        for news in soupl.select('li'):
            if len(news.select('.news-list-title')) > 0:
                title = news.select('.news-list-title')[0].text
                description = news.select('.news-list-description')[0].text
                info = news.select('.news-list-info')[0].text
                address = news.select('a')[0]['href']
                print("
    标题: {0}
    描述: {1}
    信息: {2}
    链接: {3}".format(title, description, info, address))
                getNewsDetail(address)
    
    
    locale.setlocale(locale.LC_CTYPE, 'chinese')  # Windows-specific locale name
    Listurl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
    res = requests.get(Listurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    ListCount = int(soup.select('.a1')[0].text.rstrip('条'))  # e.g. "567条" -> 567
    if (ListCount % 10 > 0):
        pagecount = ListCount // 10 + 1
    else:
        pagecount = ListCount // 10
    for i in range(1, pagecount + 1):
        if (i == 1):
            ListPageUrl = Listurl
        else:
            ListPageUrl = Listurl + '{}.html'.format(i)
        getListDetail(ListPageUrl)
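
    The chained split/lstrip/rstrip in getClickCount assumes the count API returns a JavaScript snippet ending in something like .html('1234');. A regex makes that assumption explicit and fails loudly if the format changes. A minimal alternative sketch (the response format is inferred from the parsing above, not independently verified):

    import re
    import requests

    def get_click_count(news_url):
        # Same id extraction as getClickCount above.
        news_id = re.search(r"_(.*)\.html", news_url).group(1)[-4:]
        api = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(news_id)
        body = requests.get(api).text
        # Assumes the response contains .html('<digits>');
        match = re.search(r"\.html\('(\d+)'\)", body)
        if match is None:
            raise ValueError("unexpected click-count response: " + body)
        return int(match.group(1))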

    4. Pick a topic you are interested in, scrape its data, and run a word-segmentation analysis. The topic must not duplicate another student's.

    import requests
    from bs4 import BeautifulSoup
    import jieba
    
    
    def getnewsdetail(newsurl):
        resd = requests.get(newsurl)
        resd.encoding = 'gbk'
        soupd = BeautifulSoup(resd.text, 'html.parser')
        total = len(soupd.select(".text"))
        content = ''
        for p in range(0, total):
            content += soupd.select('.text')[p].text + '\n'
        # Some items are image-only news; this check skips the word analysis for them.
        if total > 0:
            print(content + "\n词频统计如下:")
            # Punctuation and common function words excluded from the frequency count.
            delword = ['，', '。', '、', '：', '；', '？', '！', '（', '）', '“', '”',
                       '-', '.', ' ', '\n', '的', '了', '在', '是', '和', '也', '将', '中', '我们']
            wordDict = {}
            newscontent = list(jieba.cut(content))
            wordset = set(newscontent) - set(delword)
            for i in wordset:
                wordDict[i] = newscontent.count(i)
            sort = sorted(wordDict.items(), key=lambda item: item[1], reverse=True)
            # Print the top 20 words (or fewer, for very short articles).
            for item in sort[:20]:
                print(item)
        else:
            print('纯图片新闻')
    
    
    def getnewslist(newsurl):
        res = requests.get(newsurl)
        res.encoding = 'gbk'
        soup = BeautifulSoup(res.text, 'html.parser')
        for newsList in soup.select('.newslist')[0].select('li'):
            title = newsList.select('a')[0].text
            publishtime = newsList.select('.pub_time')[0].text
            address = newsList.select('a')[0]['href']
            print('\n标题:{0}\n发表时间:{1}\n新闻链接:{2}\n'.format(title, publishtime, address))
            getnewsdetail(address)
    
    
    # Add custom vocabulary (F1 driver and team names) so jieba keeps them as single tokens
    jieba.add_word('维斯塔潘')
    jieba.add_word('维特尔')
    jieba.add_word("范多恩")
    jieba.add_word("加斯利")
    jieba.add_word("托斯特")
    jieba.add_word("小红牛")
    jieba.add_word("大红牛")
    jieba.add_word("库比卡")
    jieba.add_word("马格努森")
    jieba.add_word("倍耐力")
    
    url = "http://sports.qq.com/l/f1/allf1news/list20100311191657.htm"
    getnewslist(url)
    for i in range(1, 101):
        if (i == 1):
            getnewslist(url)
        else:
            newsurl = "http://sports.qq.com/l/f1/allf1news/list20100311191657_{}.htm".format(i)
            getnewslist(newsurl)
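
    The manual set-difference plus list.count loop above re-scans the token list once per distinct word; collections.Counter does the same frequency count in a single pass. A minimal equivalent sketch, reusing the delword stop list defined above:

    from collections import Counter

    import jieba

    def top_words(content, stopwords, n=20):
        stop = set(stopwords)
        # Segment once and count in a single pass, skipping stopwords.
        counts = Counter(w for w in jieba.cut(content) if w not in stop)
        return counts.most_common(n)

    # Usage with the same variables as getnewsdetail above:
    # for word, freq in top_words(content, delword):
    #     print(word, freq)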