• 获取全部校园新闻


    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re

    #获取点击次数
    def getClickCount(newsUrl):
        """Fetch the click (hit) count for one news article via the count API.

        newsUrl: a detail-page URL of the form '..._MMDD/<id>.html'; the
        numeric <id> is extracted and queried against the oa.gzcc.cn
        count endpoint (modelid=80).  Returns the count as a string.
        """
        # URL tail looks like 'xiaoyuanxinwen_0404/9183.html':
        # capture '0404/9183', then keep the part after '/' as the article id.
        newId = re.search(r'_(.*).html', newsUrl).group(1).split('/')[1]
        clickUrl = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(newId)
        clickStr = requests.get(clickUrl).text
        # BUG FIX: the original pattern "hits').html('(.*)');" contains
        # unescaped parentheses and raises re.error ("unbalanced parenthesis")
        # at compile time; the literal ')' '(' must be escaped.
        count = re.search(r"hits'\)\.html\('(.*)'\);", clickStr).group(1)
        return count

    #获取新闻详细信息
    def getNewsDetail(newsurl):
        """Fetch one news detail page and print its metadata.

        Prints publish datetime, click count, author, URL, title and source.
        Side effects only (print); returns None.
        """
        resd = requests.get(newsurl)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')
        title = soupd.select('.show-title')[0].text
        info = soupd.select('.show-info')[0].text
        # BUG FIX: str.lstrip strips a *character set*, not a prefix, so the
        # original lstrip('发布时间:') (and the source/author variants below)
        # could eat leading characters of the value itself.  Slice past the
        # label instead.
        pub_label = '发布时间:'
        start = info.find(pub_label) + len(pub_label)
        dt = datetime.strptime(info[start:start + 19], '%Y-%m-%d %H:%M:%S')

        # BUG FIX: '>= 0' — the original '> 0' would miss a label sitting at
        # index 0 of the info string.
        if info.find('来源:') >= 0:
            source = info[info.find('来源:'):].split()[0][len('来源:'):]
        else:
            source = 'none'
        if info.find('作者:') >= 0:
            author = info[info.find('作者:'):].split()[0][len('作者:'):]
        else:
            author = 'none'
        click = getClickCount(newsurl)
        print(dt, click, author, newsurl, title, source)


    def getListPage(listPageUrl):
        """Scrape one listing page and process every news item on it.

        For each <li> that carries a '.news-list-title' element, follow its
        first anchor's href and print the article details via getNewsDetail.
        """
        page = requests.get(listPageUrl)
        page.encoding = 'utf-8'
        listing = BeautifulSoup(page.text, 'html.parser')
        news_items = (li for li in listing.select('li')
                      if li.select('.news-list-title'))
        for item in news_items:
            detail_url = item.select('a')[0].attrs['href']
            getNewsDetail(detail_url)

    # --- Entry point: find out how many listing pages exist, then crawl them. ---
    ListPageUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
    res = requests.get(ListPageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # '.a1' holds the total article count, e.g. '9183条'; 10 articles per page,
    # hence the page count n.
    n = int(soup.select('.a1')[0].text.rstrip('条')) // 10 + 1

    # Crawl the index page first, then pages 2..n.
    # BUG FIX: the original 'range(n, n+1)' visited only the single last page,
    # although n was computed as the total page count and the script's stated
    # goal is to fetch *all* campus news.
    getListPage(ListPageUrl)
    for i in range(2, n + 1):
        listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        getListPage(listUrl)

  • 相关阅读:
    fedora 安装open office
    git rebase(转)
    javascript typeof
    正则表达式入门
    XML格式
    zz 通用线程:Awk 实例,第 3部分
    ELF BIN HEX
    i2c总线(iic总线/ I square C)
    grep
    把Nginx注册成Windows 系统服务(转载)
  • 原文地址:https://www.cnblogs.com/dengjinxiu/p/8798993.html
Copyright © 2020-2023  润新知