• 爬虫大作业


    import requests
    from bs4 import BeautifulSoup

    url = 'http://news.sise.edu.cn/cms/6145.html'
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Find the first <li> that contains the news meta bar.
    # BUG FIX: '.list-unstyled list-inline' is a *descendant* selector looking for
    # a nonexistent <list-inline> tag inside .list-unstyled; the page puts both
    # classes on the same element, so the compound selector is needed.
    # NOTE(review): if the page has no <li> at all, `news` is unbound — TODO guard.
    for news in soup.select('li'):
        if news.select('.list-unstyled.list-inline'):
            break
    print(news)
    def writeNewsDetail(content):
        """Append *content* to News.txt (UTF-8).

        :param content: text to append to the local archive file
        """
        # BUG FIX: use a context manager so the file handle is closed even if
        # write() raises, instead of the manual open/close pair.
        with open('News.txt', 'a', encoding='utf-8') as f:
            f.write(content)
    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re

    获取网址

    def getClickCount(newsUrl):
        """Extract the news id from *newsUrl* and return its click count.

        :param newsUrl: detail-page URL of the form .../cms/<id>.html
        :return: click count parsed from the counter endpoint's JSONP-style body
        """
        news_id = re.search(r'cms(.*).html', newsUrl).group(1).split('/')[1]
        # BUG FIX: the extracted id was never substituted into the template —
        # the literal '{}' URL was requested instead.
        url = 'http://news.sise.edu.cn/cms/{}.html'.format(news_id)
        res = requests.get(url)
        res.encoding = 'utf-8'
        # BUG FIX: reuse this response rather than issuing a second
        # requests.get(url); the unused BeautifulSoup parse is dropped.
        return int(res.text.split('.html')[-1].lstrip("('").rstrip("');"))

    新闻内容

    def getNewsDetail(newsUrl):  # full record for a single news article
        """Fetch one news detail page and return its fields as a dict.

        :param newsUrl: absolute URL of the detail page
        :return: dict with title / dt / content / click / newsUrl, or None when
                 the meta bar is not found on the page
        """
        resd = requests.get(newsUrl)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')  # parse the detail page

        news = {}
        news['title'] = soupd.select('.text-muted-5')[0].text
        # BUG FIX: '.list-unstyled list-inline' looked for a <list-inline> tag;
        # both classes live on the same element, so use the compound selector.
        info = soupd.select('.list-unstyled.list-inline')
        for infos in info:
            items = infos.select('li')
            # BUG FIX: bs4 tags have no .xpath(); index the <li> children instead.
            # NOTE(review): assumes the 5th <li> holds the timestamp in
            # '%H:%M:%S %Y-%m-%d' order — confirm against the live page.
            news['dt'] = datetime.strptime(items[4].text, '%H:%M:%S %Y-%m-%d')
            news['content'] = soupd.select('.MsoNormal')[0].text.strip()
            #writeNewsDetail(news['content'])
            # BUG FIX: select('li[6]') is an attribute selector (li with a "6"
            # attribute) and always returned []; take the 6th <li> instead.
            # NOTE(review): presumably the click-count cell — verify.
            news['click'] = items[5].text
            news['newsUrl'] = newsUrl
            return news

    全部新闻列表

    def getListPage(pageUrl):  # every news item on one list page
        """Parse one list page and return the detail dict of each news item.

        :param pageUrl: URL of a news list page
        :return: list of dicts produced by getNewsDetail
        """
        resp = requests.get(pageUrl)
        resp.encoding = 'utf-8'
        page = BeautifulSoup(resp.text, 'html.parser')

        collected = []
        for item in page.select('li'):
            # only <li> elements carrying a .media-body are real news entries
            if item.select('.media-body'):
                link = item.select('a')[0].attrs['href']  # detail-page URL
                collected.append(getNewsDetail(link))
        return collected
    
    def getPageN():
        """Return the total number of list pages (12 news items per page)."""
        resp = requests.get('http://news.sise.edu.cn/cms/news/2.html')
        resp.encoding = 'utf-8'
        page = BeautifulSoup(resp.text, 'html.parser')
        # .a1 holds the total item count; the original .rstrip('') was a no-op
        # and has been dropped.
        total = int(page.select('.a1')[0].text)
        return total // 12 + 1
    
    newsTotal = []
    firstPageUrl = 'http://news.sise.edu.cn/cms/news/2.html'
    newsTotal.extend(getListPage(firstPageUrl))

    n = getPageN()
    # BUG FIX: range(n, n+1) crawled only the single last page despite computing
    # the total page count. Page 1 is fetched above, so walk pages 2..n to cover
    # the whole list.
    for i in range(2, n + 1):
        listPageUrl = 'http://news.sise.edu.cn/cms/news/2/p/{}.html'.format(i)
        newsTotal.extend(getListPage(listPageUrl))
  • 相关阅读:
    【转】jenkins更新主题
    【原】jenkins常用的plugin
    作业:简单的主机批量管理工具
    信号量, 事件,队列
    paramiko模块介绍
    多线程介绍
    作业:开发支持多用户在线FTP程序
    判断操作系统的三种方法
    socketserver
    新博客地址
  • 原文地址:https://www.cnblogs.com/0056a/p/8987698.html
Copyright © 2020-2023  润新知