• 爬取校园新闻首页的新闻的详情,使用正则表达式,函数抽离


    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    # URL of the campus-news list page to crawl.
    url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    res = requests.get(url)
    # Force UTF-8 decoding of the response body before res.text is read.
    res.encoding = 'utf-8'
    # Parsed list page; the loop further down selects its <li> items.
    soup = BeautifulSoup(res.text, 'html.parser')
    
    
    def getClick(newsUrl):
        """Return the click count of a news article as a string.

        The article id is taken from ``newsUrl`` and sent to the counter
        endpoint on oa.gzcc.cn; the value is cut out of the response text.

        Args:
            newsUrl: URL of the article page. It must contain an underscore
                and end in ``.html``; the id is the last ``/``-separated
                piece between the first underscore and ``.html``.

        Returns:
            The text that follows the last ``.html`` in the response, with
            leading ``(`` / ``'`` and trailing ``'`` / ``)`` / ``;``
            characters stripped (presumably the hit count -- verify against
            the endpoint's actual response).

        Raises:
            AttributeError: if ``newsUrl`` does not match the expected shape.
        """
        # Raw string and an escaped dot: the pattern now means exactly
        # "underscore, anything, literal .html" and no longer relies on the
        # invalid string escape '\_'.
        # [-1] instead of [1] keeps the same result for ".../_0404/9183.html"
        # and also works when there is no '/' after the underscore.
        newId = re.search(r'_(.*)\.html', newsUrl).group(1).split('/')[-1]
        click = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newId))
        return click.text.split('.html')[-1].lstrip("('").rstrip("');")
    
    
    def getNews(newsUrl, infoText=None):
        """Print the metadata of one news article.

        Publish time, author, reviewer, source and photographer are pulled
        out of the article's info line with regular expressions, the click
        count is fetched through getClick(), and every field is printed.

        Args:
            newsUrl: URL of the article page; forwarded to getClick().
            infoText: Text of the article's info line. When omitted, the
                module-level variable ``info`` is used, so existing
                ``getNews(newsUrl)`` calls keep working.

        Raises:
            ValueError: if a publish time is found but does not parse with
                the ``'%Y-%m-%d %H:%M:%S'`` format.
        """
        text = info if infoText is None else infoText

        def field(pattern):
            # A label that is absent from the info line yields '' instead of
            # failing with AttributeError on a None match.
            found = re.search(pattern, text)
            return found.group(1) if found else ''

        # The patterns are raw strings with the regex escapes restored:
        # \s is whitespace and \d is a digit (a bare 's' / 'd' would only
        # match those literal letters).
        f = field(r'作者:((.{3}\s){1,3})')
        date = field(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})')
        i = field(r'审核:((.{3}\s){1,3})')
        l = field(r'来源:((.{3}\s){1,3})')
        # NOTE(review): this pattern has one more leading '.' than the other
        # three; kept as written -- confirm whether that is intended.
        m = field(r'摄影:((..{3}\s){1,3})')
        count = getClick(newsUrl)
        dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S') if date else ''
        print('发布时间:{0}'.format(dateTime))
        print('作者:{0}'.format(f))
        print('审核:{0}'.format(i))
        print('来源:{0}'.format(l))
        print('摄影:{0}'.format(m))
        print('点击:{0}'.format(count))
    
    
    
    # Walk the <li> items of the list page, fetch the detail page of the
    # first item that carries a .news-list-title element, print it, and stop.
    for news in soup.select('li'):
        titleNodes = news.select('.news-list-title')
        if not titleNodes:
            # <li> items without a title element are skipped.
            continue

        t = titleNodes[0].text
        d = news.select('.news-list-description')[0].text
        newsUrl = news.select('a')[0]['href']

        # Separate name so the list-page response in `res` is not overwritten.
        detailRes = requests.get(newsUrl)
        detailRes.encoding = 'utf-8'
        soupb = BeautifulSoup(detailRes.text, 'html.parser')
        # `info` must stay a module-level name: getNews() reads it.
        info = soupb.select('.show-info')[0].text
        e = soupb.select('.show-content')[0].text

        print('标题:' + t)
        print('链接:' + newsUrl)
        print('描述:' + d)
        print('正文:' + e)
        getNews(newsUrl)
        # Only the first news item is processed.
        break
     
    

  • 相关阅读:
    Redis源码分析(二十一)--- anet网络通信的封装
    leetcode 总结part1
    leetcode String to Integer (atoi)
    leetcode 165. Compare Version Numbers
    leetcode 189. Rotate Array
    leetcode 168. Excel Sheet Column Title
    leetcode 155. Min Stack
    leetcode 228. Summary Ranges
    leetcode 204. Count Primes
    leetcode 6. ZigZag Conversion
  • 原文地址:https://www.cnblogs.com/170he/p/8763050.html
Copyright © 2020-2023  润新知