• 爬虫大作业


    import requests
    from bs4 import BeautifulSoup

    url = 'http://news.sise.edu.cn/cms/6145.html'
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Find the first <li> that wraps the news metadata list, then show it.
    # BUG FIX: '.list-unstyled list-inline' is a descendant selector (class
    # list-unstyled, then a nonexistent <list-inline> tag); both classes sit
    # on the same element, so they must be joined with a dot.
    for news in soup.select('li'):
        if len(news.select('.list-unstyled.list-inline')):
            break
    print(news)
    def writeNewsDetail(content):
        """Append *content* to News.txt in the working directory (UTF-8).

        :param content: text to append; caller supplies any trailing newline.
        """
        # `with` guarantees the handle is closed even if write() raises,
        # unlike the manual open()/close() pair.
        with open('News.txt', 'a', encoding='utf-8') as f:
            f.write(content)
    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re

    获取网址

    def getClickCount(newsUrl):
        """Fetch and return the click count (int) for one news article.

        :param newsUrl: article URL of the form .../cms/<id>.html
        """
        # Pull the numeric article id out of e.g. '.../cms/6145.html'.
        newId = re.search('cms(.*).html', newsUrl).group(1).split('/')[1]
        # BUG FIX: the original never substituted newId into the template,
        # so it always requested the literal '{}.html' URL.
        url = 'http://news.sise.edu.cn/cms/{}.html'.format(newId)
        res = requests.get(url)
        res.encoding = 'utf-8'
        # The response body ends in something like "...('1234');" — peel off
        # the JS wrapper and keep the number. (The original fetched the same
        # URL a second time and built an unused soup; both removed.)
        return int(res.text.split('.html')[-1].lstrip("('").rstrip("');"))

    新闻内容

    def getNewsDetail(newsUrl):  # 一篇新闻的全部信息
        """Scrape one article page into a dict with title, datetime,
        content, click field and the source URL; returns None when the
        metadata list is missing.

        :param newsUrl: URL of the news detail page.
        """
        resd = requests.get(newsUrl)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')  # fetch and parse the detail page

        news = {}
        news['title'] = soupd.select('.text-muted-5')[0].text
        # BUG FIX: both classes live on the same element — join with '.'.
        info = soupd.select('.list-unstyled.list-inline')
        for infos in info:
            # BUG FIX: BeautifulSoup has no .xpath(); index the <li> children
            # of the metadata list instead. The 5th <li> is assumed to hold
            # the timestamp text — TODO confirm against a live page.
            items = infos.select('li')
            news['dt'] = datetime.strptime(items[4].text.strip(), '%H:%M:%S %Y-%m-%d')
            news['content'] = soupd.select('.MsoNormal')[0].text.strip()
            # writeNewsDetail(news['content'])
            # BUG FIX: select('li[6]') is a CSS *attribute* selector (li with
            # a "6" attribute), not "the 6th li"; presumably the 6th item
            # carries the click info — verify against the page markup.
            news['click'] = items[5].text
            news['newsUrl'] = newsUrl
            return news

    全部新闻列表

    def getListPage(pageUrl):  # 一个列表页的全部新闻
        """Return the detail dicts of every article linked on one list page.

        :param pageUrl: URL of a news list page.
        """
        response = requests.get(pageUrl)
        response.encoding = 'utf-8'
        page = BeautifulSoup(response.text, 'html.parser')

        # Each article entry is an <li> wrapping a .media-body block; its
        # first <a> carries the detail-page link.
        return [
            getNewsDetail(item.select('a')[0].attrs['href'])
            for item in page.select('li')
            if item.select('.media-body')
        ]
    
    def getPageN():
        """Return the number of list pages, assuming 12 articles per page."""
        response = requests.get('http://news.sise.edu.cn/cms/news/2.html')
        response.encoding = 'utf-8'
        page = BeautifulSoup(response.text, 'html.parser')
        # The first .a1 element holds the total article count. The original
        # chained .rstrip('') here, which strips nothing and was dropped.
        total = int(page.select('.a1')[0].text)
        return total // 12 + 1
    
    newsTotal = []
    firstPageUrl = 'http://news.sise.edu.cn/cms/news/2.html'
    newsTotal.extend(getListPage(firstPageUrl))

    n = getPageN()
    # BUG FIX: range(n, n+1) visited only the single last page; to crawl the
    # whole site, walk pages 2..n (page 1 was already fetched above via
    # firstPageUrl).
    for i in range(2, n + 1):
        listPageUrl = 'http://news.sise.edu.cn/cms/news/2/p/{}.html'.format(i)
        newsTotal.extend(getListPage(listPageUrl))
  • 相关阅读:
    myisamchk命令修复表操作
    CentOS下的yum upgrade和yum update区别
    CentOS 6.9/7通过yum安装指定版本的MySQL
    Linux下Shell去除空行的方法
    Linux下环境变量设置技巧
    交互设计师如何做运营需求-以网易严选邀请新人功能设计为例
    对应用启动时间的关注和获取
    快速发现并解决maven依赖传递冲突
    mock测试方法及实践改进
    网易杭研易盾实习心得(4)
  • 原文地址:https://www.cnblogs.com/0056a/p/8987698.html
Copyright © 2020-2023  润新知