获取全部校园新闻

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

# Get the click (hit) count of a news article.
def _news_id_from_url(newsUrl):
    """Extract the news id from an article URL.

    The id is the second '/'-separated component of the text found between
    the first '_' and the trailing '.html'
    (e.g. '.../xiaoyuanxinwen_0404/9183.html' -> '9183').

    Raises:
        ValueError: if the URL does not contain that pattern.
    """
    # Raw string with an escaped dot: the original '\_(.*).html' relied on an
    # invalid escape sequence and let '.' match any character.
    match = re.search(r'_(.*)\.html', newsUrl)
    parts = match.group(1).split('/') if match else []
    if len(parts) < 2:
        raise ValueError('cannot extract news id from URL: {!r}'.format(newsUrl))
    return parts[1]


def _hits_from_response(clickStr):
    """Pull the hit count out of the counter API response text.

    Looks for a fragment of the form ``hits').html('<count>');`` and returns
    ``<count>`` as a string.

    Raises:
        ValueError: if no such fragment is present.
    """
    # The parentheses and dot are literal text in the response and must be
    # escaped; unescaped they make the pattern invalid (unbalanced parenthesis).
    # Non-greedy so that any later ``');`` in the response is not swallowed.
    match = re.search(r"hits'\)\.html\('(.*?)'\);", clickStr)
    if match is None:
        raise ValueError('no hit count found in response: {!r}'.format(clickStr))
    return match.group(1)


def getClickCount(newsUrl):
    """Return the click count of the article at ``newsUrl`` as a string.

    Derives the news id from the URL, queries the counter API for that id and
    extracts the hit count from the response body.

    Raises:
        ValueError: if the URL or the API response has an unexpected format.
    """
    newId = _news_id_from_url(newsUrl)
    clickUrl = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(newId)
    clickStr = requests.get(clickUrl).text
    return _hits_from_response(clickStr)

# Get the detailed information of a news article.
def _field_after_label(info, label):
    """Return the whitespace-delimited value that follows ``label`` in ``info``.

    Returns the string 'none' when ``label`` does not occur in ``info``.
    """
    # Match the label as an exact prefix. str.lstrip(label) would instead strip
    # any leading characters drawn from the label's character set and corrupt
    # values that begin with one of them (e.g. '作者：作家协会').
    match = re.search(re.escape(label) + r'(\S*)', info)
    return match.group(1) if match else 'none'


def _parse_news_info(info):
    """Parse the info line of an article into ``(dt, source, author)``.

    ``dt`` is the first 'YYYY-MM-DD HH:MM:SS' timestamp found in ``info``;
    ``source`` and ``author`` are the values following '来源：' and '作者：',
    or 'none' when the respective label is absent.

    Raises:
        ValueError: if ``info`` contains no timestamp.
    """
    # Searching for the timestamp itself avoids depending on the exact label
    # text (ASCII vs. full-width colon) that precedes it.
    stamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', info)
    if stamp is None:
        raise ValueError('no publication timestamp found in: {!r}'.format(info))
    dt = datetime.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S')
    source = _field_after_label(info, '来源：')
    author = _field_after_label(info, '作者：')
    return dt, source, author


def getNewsDetail(newsurl):
    """Fetch one article page and print its details.

    Prints, in this order: publication datetime, click count, author, URL,
    title and source. Returns None.

    Raises:
        ValueError: if the info line carries no timestamp.
        IndexError: if the page has no '.show-title' or '.show-info' element.
    """
    resd = requests.get(newsurl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    title = soupd.select('.show-title')[0].text
    info = soupd.select('.show-info')[0].text
    dt, source, author = _parse_news_info(info)
    click = getClickCount(newsurl)
    print(dt, click, author, newsurl, title, source)

def getListPage(listPageUrl):
    """Fetch one news list page and process every article it links to.

    Downloads ``listPageUrl``, decodes it as UTF-8 and, for each ``<li>``
    that contains a '.news-list-title' element, passes the ``href`` of the
    first ``<a>`` in that item to ``getNewsDetail``.
    """
    response = requests.get(listPageUrl)
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')
    for item in page.select('li'):
        # Skip list items that are not news entries.
        if not item.select('.news-list-title'):
            continue
        firstLink = item.select('a')[0]
        getNewsDetail(firstLink.attrs['href'])

# Entry page of the campus-news section; it is also the first list page.
ListPageUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
res = requests.get(ListPageUrl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# The first '.a1' element's text is the total article count followed by '条'.
newsTotal = int(soup.select('.a1')[0].text.rstrip('条'))
# Ceiling division at 10 articles per page (page size as assumed by the
# original code). The former `total // 10 + 1` produced one page too many
# whenever the total was an exact multiple of 10.
n = (newsTotal + 9) // 10

# Page 1 is the section index itself; the remaining pages are '<i>.html'.
getListPage(ListPageUrl)
# Start at page 2 so that every list page is visited. The former
# range(n, n + 1) processed only the last page.
for i in range(2, n + 1):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getListPage(listUrl)

相关阅读:
fedora 安装open office
git rebase(转)
javascript typeof
正则表达式入门
XML格式
zz 通用线程：Awk 实例，第 3部分
ELF BIN HEX
i2c总线（iic总线/ I square C）
grep
把Nginx注册成Windows 系统服务(转载)

原文地址：https://www.cnblogs.com/dengjinxiu/p/8798993.html