• Structuring and saving the data


    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    import pandas
    
    news_list = []
    
    
    def crawlOnePageSchoolNews(page_url):
        res0 = requests.get(page_url)
        res0.encoding = 'UTF-8'
        soup0 = BeautifulSoup(res0.text, 'html.parser')
        news = soup0.select('.news-list > li')
    
        for n in news:
            # print(n)
            print('**' * 5 + 'List page info' + '**' * 10)
            print('News link: ' + n.a.attrs['href'])
            print('News title: ' + n.select('.news-list-title')[0].text)
            print('News description: ' + n.a.select('.news-list-description')[0].text)
            print('News time: ' + n.a.select('.news-list-info > span')[0].text)
            print('News source: ' + n.a.select('.news-list-info > span')[1].text)
            news_detail = getNewDetail(n.a.attrs['href'])  # renamed so it does not shadow the list being iterated
            news_detail['标题'] = n.select('.news-list-title')[0].text
            news_list.append(news_detail)
        return news_list
    
    
    def getNewDetail(href):
        print('**' * 5 + 'Detail page info' + '**' * 10)
        print(href)
        res1 = requests.get(href)
        res1.encoding = 'UTF-8'
        soup1 = BeautifulSoup(res1.text, 'html.parser')
        news = {}
        if soup1.select('#content'):
            news_content = soup1.select('#content')[0].text
            news['内容'] = news_content
            print(news_content)  # article body
        else:
            news['内容'] = ''
        if soup1.select('.show-info'):  # guard against older pages that lack a .show-info block
            news_info = soup1.select('.show-info')[0].text
        else:
            return news
        info_list = ['来源', '发布时间', '点击', '作者', '审核', '摄影']  # fields to parse (source, publish time, hits, author, reviewer, photographer); they must match the page text, so they stay in Chinese
        news_info_set = set(news_info.split('\xa0')) - {' ', ''}  # &nbsp; in the page is parsed as \xa0, so it serves as the field separator
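        # e.g. a (hypothetical) metadata line '作者:张三\xa0审核:李四\xa0来源:学校综合办'
        # splits into the set {'作者:张三', '审核:李四', '来源:学校综合办'}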
        # loop over the fragments and print the article metadata
        for n_i in news_info_set:
            for info_flag in info_list:
                if n_i.find(info_flag) != -1:  # match on the field name, since the publish time uses an ASCII colon while the other fields use a full-width one
                    if info_flag == '发布时间':
                        # convert the publish-time string to a datetime, so it can later be stored in a database
                        release_time = datetime.strptime(n_i[n_i.index(':') + 1:], '%Y-%m-%d %H:%M:%S ')
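                        # e.g. a (hypothetical) fragment '发布时间:2018-04-04 09:00:00 ' ends
                        # with a space, hence the trailing space in the format string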
                        news[info_flag] = release_time
                        print(info_flag + ':', release_time)
                    elif info_flag == '点击':  # the hit count is written in by JS after a PHP call keyed on the article id, so it is handled separately
                        news[info_flag] = getClickCount(href)
                    else:
                        news[info_flag] = n_i[n_i.index(':') + 1:]  # the remaining fields use a full-width colon
                        print(info_flag + ':' + n_i[n_i.index(':') + 1:])
        print('————' * 40)
        return news
    
    
    def getClickCount(news_url):
        click_num_url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'
        click_num_url = click_num_url.format(re.search(r'_(.*)/(.*).html', news_url).group(2))  # article id taken from the URL
        res2 = requests.get(click_num_url)
        res2.encoding = 'UTF-8'
        click_num = re.search(r"\$\('#hits'\)\.html\('(\d*)'\)", res2.text).group(1)
        print('Hits: ' + click_num)
        return int(click_num)  # as an int, so numeric filters on the DataFrame work later
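    # The count endpoint replies with a small JS snippet that the article page runs,
    # presumably of the shape  $('#hits').html('2314');  (the number is hypothetical);
    # the regex above captures the digits inside html('...').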
    
    
    print(crawlOnePageSchoolNews('http://news.gzcc.cn/html/xiaoyuanxinwen/'))  # crawl the index page (page 1) first
    
    pageURL = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'
    res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsSum = int(re.search(r'(\d*)条', soup.select('a.a1')[0].text).group(1))  # total article count, taken from the pager link text
    if newsSum % 10:  # 10 articles per list page, so round the page count up
        pageSum = int(newsSum / 10) + 1
    else:
        pageSum = int(newsSum / 10)
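    # e.g. with a hypothetical newsSum of 224: 224 % 10 != 0, so pageSum = 22 + 1 = 23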
    
    for i in range(2, pageSum + 1):  # page 1 is the index page already crawled above
        crawlOnePageSchoolNews(pageURL.format(i))
    
    
    dit = pandas.DataFrame(news_list)
    dit.to_excel('test.xlsx')  # writing .xlsx needs an Excel engine such as openpyxl installed
    dit.to_csv('test.csv')
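    # if the CSV is to be opened in Excel, passing encoding='utf_8_sig' to to_csv
    # avoids mojibake with the Chinese column names and values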
    
    print(dit[['作者', '来源']][:6])
    print(dit[(dit['来源'] == '学校综合办') & (dit['点击'] > 3000)])
    print(dit[dit['来源'].isin(['国际学院', '学生工作处'])])
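
The publish time was converted to a datetime above precisely so the rows can later go into a database. As a minimal sketch of that step (assuming SQLite, with news.db and the table name news as placeholder names), the same DataFrame can be persisted with pandas' to_sql:

    import sqlite3

    with sqlite3.connect('news.db') as db:
        # if_exists='replace' drops and recreates the table on every run
        dit.to_sql('news', db, if_exists='replace', index=False)
        # read a few rows back to verify the round trip
        print(pandas.read_sql_query('SELECT "标题", "点击" FROM news LIMIT 3', db))

Under SQLite, to_sql stores the datetime values as ISO-formatted text, which still sorts and filters correctly.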