import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
def getClickCount(newsUrl):
    """Return the click count of a news article as reported by the counter API.

    The article id is taken from ``newsUrl`` and sent to the
    ``oa.gzcc.cn`` counter endpoint; the count is then parsed out of the
    ``hits`` assignment in the returned script text.

    Args:
        newsUrl: URL of the article page. It must contain an underscore
            followed by ``<something>/<id>.html``.

    Returns:
        The click count as a string, exactly as it appears in the response.

    Raises:
        ValueError: If the article id cannot be extracted from ``newsUrl``
            or the response does not contain a ``hits`` value.
    """
    # Raw strings with escaped metacharacters: the previous patterns used an
    # invalid "\_" escape and left "." unescaped.
    idMatch = re.search(r'_(.*)\.html', newsUrl)
    if idMatch is None:
        raise ValueError('cannot find article id in URL: {!r}'.format(newsUrl))
    # For a URL shaped like ".../xiaoyuanxinwen_0404/9183.html" the captured
    # text is "0404/9183"; the id is the second "/"-separated component.
    idParts = idMatch.group(1).split('/')
    if len(idParts) < 2:
        raise ValueError('cannot find article id in URL: {!r}'.format(newsUrl))
    newId = idParts[1]

    clickUrl = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(newId)
    # A timeout keeps the crawler from hanging forever on an unresponsive server.
    clickStr = requests.get(clickUrl, timeout=10).text

    # The parentheses are literal characters of the response text and must be
    # escaped; unescaped they made re.search raise "unbalanced parenthesis".
    # The lazy group stops at the first closing "');" after the value.
    hitsMatch = re.search(r"hits'\)\.html\('(.*?)'\);", clickStr)
    if hitsMatch is None:
        raise ValueError('no hits value in counter response for id {}'.format(newId))
    return hitsMatch.group(1)
def _getInfoField(info, label):
    """Return the text between *label* and the next whitespace, or 'none'."""
    start = info.find(label)
    # find() returns -1 when absent; index 0 is a valid hit.
    if start < 0:
        return 'none'
    firstToken = info[start:].split()[0]
    # Drop the label as a prefix. str.lstrip(label) would strip a *set of
    # characters* and could eat leading characters of the value itself.
    return firstToken[len(label):]


def getNewsDetail(newsurl):
    """Fetch one news article page and print its details.

    Prints, in order: publication datetime, click count, author, URL,
    title and source. Author and source are printed as ``'none'`` when
    their label is not present in the info line.

    Args:
        newsurl: URL of the article page.

    Returns:
        None. The details are written to standard output.

    Raises:
        IndexError: If the page has no ``.show-title`` or ``.show-info`` element.
        ValueError: If no ``YYYY-MM-DD HH:MM:SS`` timestamp is found in the
            info line.
    """
    # A timeout keeps the crawler from hanging forever on an unresponsive server.
    resd = requests.get(newsurl, timeout=10)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    title = soupd.select('.show-title')[0].text
    info = soupd.select('.show-info')[0].text

    # Locate the timestamp directly instead of stripping the label with
    # lstrip(), which removes characters rather than a prefix and breaks on
    # leading whitespace.
    stamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', info)
    if stamp is None:
        raise ValueError('no publication time found in: {!r}'.format(info))
    dt = datetime.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S')

    source = _getInfoField(info, '来源:')
    author = _getInfoField(info, '作者:')
    click = getClickCount(newsurl)
    print(dt, click, author, newsurl, title, source)
def getListPage(listPageUrl):
    """Fetch one news list page and process every article linked from it.

    Each ``<li>`` that contains a ``.news-list-title`` element is treated
    as a news entry; the ``href`` of its first ``<a>`` is passed to
    ``getNewsDetail``.

    Args:
        listPageUrl: URL of the list page to fetch.
    """
    response = requests.get(listPageUrl)
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')
    for item in page.select('li'):
        # Skip list items that are not news entries.
        if not item.select('.news-list-title'):
            continue
        firstLink = item.select('a')[0]
        getNewsDetail(firstLink.attrs['href'])
# Driver: read the total article count from the front list page, derive the
# number of list pages, then crawl the front page and page n.
ListPageUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
res = requests.get(ListPageUrl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
# The '.a1' element text is the total count followed by the '条' suffix.
newsTotal = int(soup.select('.a1')[0].text.rstrip('条'))
# Ceiling division by the page size (presumably 10 items per page -- verify
# against the site). The previous `total // 10 + 1` yielded one page too many
# whenever the total was an exact multiple of 10.
n = max(1, (newsTotal + 9) // 10)
getListPage(ListPageUrl)
# NOTE(review): range(n, n + 1) visits only page n (the last page). If the
# intent is to crawl every page after the front page, this should be
# range(2, n + 1) -- confirm before changing, as it multiplies the requests.
for i in range(n, n + 1):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getListPage(listUrl)