import requests
from bs4 import BeautifulSoup

# Exploratory fetch of a single article page: print <li> elements until one
# that contains the article's metadata list is found.
url = 'http://news.sise.edu.cn/cms/6145.html'
res = requests.get(url)
res.encoding = 'utf-8'  # decode the body as UTF-8 instead of the guessed charset
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('li'):
    # The original selector '.list-unstyled list-inline' looked for a
    # <list-inline> TAG nested inside a .list-unstyled element. Matching one
    # element that carries both classes needs the classes chained with no space.
    # NOTE(review): presumably both are classes of the same <ul> - confirm
    # against the page markup.
    if news.select('.list-unstyled.list-inline'):
        break
    # NOTE(review): the original indentation was lost; print() is assumed to
    # belong to the loop body (printing every <li> seen before the break).
    print(news)
def writeNewsDetail(content):
    """Append *content* to ``News.txt`` in the current working directory.

    Args:
        content: Text to append; written as UTF-8 with no separator added.
    """
    # The context manager closes the file even when write() raises, which the
    # manual open()/close() pair did not guarantee.
    with open('News.txt', 'a', encoding='utf-8') as f:
        f.write(content)
import re
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Get the URL (click-count helper)
def getClickCount(newsUrl):
    """Return the click count reported for the article at *newsUrl*.

    The article id is the first path segment between ``cms`` and ``.html`` in
    *newsUrl* (``.../cms/6145.html`` gives ``6145``).

    Args:
        newsUrl: URL of an article page containing ``cms/<id>.html``.

    Returns:
        The click count as an ``int``.

    Raises:
        ValueError: If no article id can be extracted from *newsUrl*, or if
            the response does not end in a numeric counter.
    """
    # Escape the dot so only a literal ".html" suffix matches.
    match = re.search(r'cms(.*)\.html', newsUrl)
    parts = match.group(1).split('/') if match else []
    if len(parts) < 2:
        raise ValueError('cannot extract a news id from {!r}'.format(newsUrl))
    newId = parts[1]

    # The original never filled in the placeholder and requested the literal
    # ".../cms/{}.html"; it also downloaded the page twice.
    # NOTE(review): confirm this endpoint really returns the counter snippet
    # that the parsing below expects.
    url = 'http://news.sise.edu.cn/cms/{}.html'.format(newId)
    res = requests.get(url)
    res.encoding = 'utf-8'

    # Keep what follows the last ".html", then strip the surrounding
    # "(", "'", ")" and ";" characters before converting to int.
    return int(res.text.split('.html')[-1].lstrip("('").rstrip("');"))
# News content (article detail page)
def getNewsDetail(newsUrl):
    """Fetch one article page and return all of its details as a dict.

    Args:
        newsUrl: Absolute URL of the article page.

    Returns:
        A dict with the keys ``title``, ``dt``, ``content``, ``click`` and
        ``newsUrl``. ``dt`` and ``click`` are ``None`` when the page has no
        metadata list with at least six ``<li>`` entries.

    Raises:
        IndexError: If the page has no ``.text-muted-5`` or ``.MsoNormal``
            element.
        ValueError: If the timestamp text does not match
            ``%H:%M:%S %Y-%m-%d``.
    """
    # Open the article detail page and parse it.
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    title = soupd.select('.text-muted-5')[0].text

    dt = None
    click = None
    # The original selector '.list-unstyled list-inline' searched for a
    # <list-inline> tag, and then called .xpath() (not a BeautifulSoup API)
    # and select('li[6]') (an attribute selector, not a positional one).
    # NOTE(review): presumably both names are classes on the same <ul> -
    # confirm against the page markup.
    for info in soupd.select('.list-unstyled.list-inline'):
        items = info.select('li')
        if len(items) < 6:
            continue
        # NOTE(review): assumes the 5th <li> holds only the timestamp and the
        # 6th <li> holds the click counter - confirm against the page markup.
        dt = datetime.strptime(items[4].text.strip(), '%H:%M:%S %Y-%m-%d')
        click = items[5].text.strip()
        break

    content = soupd.select('.MsoNormal')[0].text.strip()
    # The call writeNewsDetail(content) was disabled in the original and
    # stays disabled here.

    return {
        'title': title,
        'dt': dt,
        'content': content,
        'click': click,
        'newsUrl': newsUrl,
    }
# Full news list (list pages and crawl)
def getListPage(pageUrl):
    """Return the details of every article linked from one list page.

    Args:
        pageUrl: URL of a news list page.

    Returns:
        A list with one ``getNewsDetail`` result per ``<li>`` that contains a
        ``.media-body`` element and a link.
    """
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    newsList = []
    for news in soup.select('li'):
        if not news.select('.media-body'):
            continue
        link = news.select_one('a')
        if link is None or not link.get('href'):
            # An entry without a usable link is skipped instead of raising.
            continue
        # Resolve relative hrefs against the list page; absolute URLs pass
        # through urljoin unchanged.
        newsUrl = urljoin(pageUrl, link['href'])
        newsList.append(getNewsDetail(newsUrl))
    return newsList


def getPageN():
    """Return the number of list pages, computed from the total article count.

    The total is read from the first ``.a1`` element of the first list page,
    after stripping the trailing '条' characters.

    Returns:
        The page count as an ``int`` (at least 1).
    """
    res = requests.get('http://news.sise.edu.cn/cms/news/2.html')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    n = int(soup.select('.a1')[0].text.rstrip('条'))
    # Ceiling division: the original "n // 12 + 1" reported one page too many
    # whenever n was an exact multiple of 12.
    # NOTE(review): 12 items per page is taken from the original - confirm.
    return max(1, (n + 11) // 12)


newsTotal = []
firstPageUrl = 'http://news.sise.edu.cn/cms/news/2.html'
newsTotal.extend(getListPage(firstPageUrl))
n = getPageN()
# NOTE(review): range(n, n + 1) visits only page n (the last page); pages
# 2..n-1 are never fetched. Kept as in the original - confirm whether
# range(2, n + 1) was intended.
for i in range(n, n + 1):
    listPageUrl = 'http://news.sise.edu.cn/cms/news/2/p/{}.html'.format(i)
    newsTotal.extend(getListPage(listPageUrl))