• Notes on a simple Sina news scraper


    1. A simple way to scrape Sina news

           1. requests
           2. pyquery for parsing
           3. a custom logging module, logger.py

    For comparison, the code drives the extraction in three ways: Python's multiprocessing, gevent, and a plain loop (a small timing sketch follows the listing below).

    The spider (sinaForex.py) first extracts the article lists, then calls the methods in spiderDetail.py to fetch the full content of each news item.
    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    
    # monkey-patch the standard library first, before anything else pulls in sockets
    import gevent.monkey
    gevent.monkey.patch_all()

    import time
    from multiprocessing import Pool

    import requests
    from pyquery import PyQuery as pq
    import gevent.pool

    from logger import *
    from spiderDetail import *
    
    sina_forex_url = 'http://finance.sina.com.cn/forex/'

    def get_index_pages():
        response = requests.get(sina_forex_url)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            content = response.text
            newsSet = set()  # all article URLs; a story can appear in several sections, so deduplicate
            # headline story
            hot_ad_link = pq(content)('#wrap > div:nth-child(24) > div.Center.headline > h1 > a')
            hot_ad_url = hot_ad_link.attr('href')
            hot_ad_title = hot_ad_link.text()
            logger.debug('Headline story')
            print(hot_ad_url)
            newsSet.add(hot_ad_url)
    
            # focus news list
            logger.debug('Focus news')
            focus_news_lists = get_focus_news(content)
            for new in focus_news_lists:
                newsSet.add(new['url'])
                logger.debug(new)
            # rolling news
            logger.debug('Rolling news')
            roll_news_lists = get_roll_news(content)
            for new in roll_news_lists:
                newsSet.add(new['url'])
                logger.debug(new)
            # 24-hour news express
            logger.debug('24-hour news')
            hours24_news_lists = get_24hours_news(content)
            for new in hours24_news_lists:
                newsSet.add(new['url'])
                logger.debug(new)
            # analysis and data news
            logger.debug('Analysis and data')
            analysis_news_lists = get_analysis_news(content)
            for new in analysis_news_lists:
                newsSet.add(new['url'])
                logger.debug(new)
            # institutional opinion news
            logger.debug('Institutional opinions')
            institution_opinion_news_lists = get_institution_opinion_news(content)
            for new in institution_opinion_news_lists:
                newsSet.add(new['url'])
                logger.debug(new)
            # expert opinion news
            logger.debug('Expert opinions')
            specialist_opinion_news_lists = get_specialist_opinion_news(content)
            for new in specialist_opinion_news_lists:
                newsSet.add(new['url'])
                logger.debug(new)
            # RMB exchange rate news
            logger.debug('RMB exchange rate')
            rmb_exchange_news_lists = get_RMB_exchange_news(content)
            for new in rmb_exchange_news_lists:
                newsSet.add(new['url'])
                logger.debug(new)
    
            # fetch the full content of every article
            logger.debug('Total news items to crawl: ' + str(len(newsSet)))
            # plain loop
            #for url in newsSet:
            #    get_page_detail(url)

            # multiprocessing pool
            #pool = Pool(5)
            #pool.map(get_page_detail, newsSet)
            #pool.close()
            #pool.join()

            # gevent coroutine pool
            pool = gevent.pool.Pool(5)
            pool.map(get_page_detail, newsSet)

            return len(newsSet)
        else:
            logger.info('Failed to request the Sina forex homepage')
            return 0
    
    # focus news
    def get_focus_news(content):
        focus_news_list = pq(content)('#wrap > div:nth-child(24) > div.Center.headline > div.ListB > ul > li')
        for li in focus_news_list.items():
            yield {
                'title':li.text(),
                'url':li('a').attr('href')
            }
    # rolling news
    def get_roll_news(content):
        roll_news_list = pq(content)('#wrap > div:nth-child(25) > div.Center > div.ListB > ul > li')
        for li in roll_news_list.items():
            yield {
                'title':li.text(),
                'url':li('a').attr('href')
            }
    # 24-hour news express
    def get_24hours_news(content):
        roll_news_list = pq(content)('#wrap > div.PartA.Top10 > div.CenterB > div.ListB.ListE > ul > li')
        for li in roll_news_list.items():
            yield {
                'title':li('a').text() + li('span').text(),
                'url':li('a').attr('href')
            }
    # analysis and data news
    def get_analysis_news(content):
    
        roll_news_list = pq(content)('#wrap > div:nth-child(28) > div.Center > div.ListE > ul > li')
        for li in roll_news_list.items():
            yield {
                'title':li('a').text() + li('span').text(),
                'url':li('a').attr('href')
            }
    # institutional opinion news
    def get_institution_opinion_news(content):
        roll_news_list = pq(content)('#wrap > div:nth-child(29) > div.Center > div.ListE > ul > li')
        for li in roll_news_list.items():
            yield {
                'title':li('a').text() + li('span').text(),
                'url':li('a').attr('href')
            }
    # expert opinion news
    def get_specialist_opinion_news(content):
        roll_news_list = pq(content)('#wrap > div:nth-child(30) > div.Center > div.ListE > ul > li')
        for li in roll_news_list.items():
            yield {
                'title':li('a').text() + li('span').text(),
                'url':li('a').attr('href')
            }
    # RMB exchange rate news
    def get_RMB_exchange_news(content):
        roll_news_list = pq(content)('#wrap > div:nth-child(31) > div.Center > div.ListE > ul > li')
        for li in roll_news_list.items():
            yield {
                'title':li('a').text() + li('span').text(),
                'url':li('a').attr('href')
            }
    
    #get_index_pages()
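
    As mentioned above, the plain loop, the multiprocessing pool and the gevent pool can be compared with a small timing harness. The sketch below is a minimal, standalone comparison and not part of the spider itself; fetch_one is a hypothetical stand-in for get_page_detail, and the sample workload simply repeats the forex index page:

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    # Standalone timing harness: plain loop vs. multiprocessing vs. gevent.
    # fetch_one is a hypothetical stand-in for get_page_detail.

    import time
    from multiprocessing import Pool

    import gevent.monkey
    gevent.monkey.patch_all()   # as in the spider; on some platforms this can
                                # interfere with multiprocessing, so compare the
                                # strategies in separate runs if results look odd
    import gevent.pool

    import requests


    def fetch_one(url):
        # placeholder task: download the page and return its size
        try:
            return len(requests.get(url, timeout=10).content)
        except requests.RequestException:
            return 0


    def run_loop(urls):
        return [fetch_one(u) for u in urls]


    def run_processes(urls):
        with Pool(5) as p:
            return p.map(fetch_one, urls)


    def run_gevent(urls):
        return gevent.pool.Pool(5).map(fetch_one, urls)


    if __name__ == '__main__':
        urls = ['http://finance.sina.com.cn/forex/'] * 10   # sample workload
        for name, fn in (('loop', run_loop), ('multiprocessing', run_processes), ('gevent', run_gevent)):
            start = time.time()
            fn(urls)
            print('%-16s %.2fs' % (name, time.time() - start))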
    spiderDetail.py, which extracts the full article content:
    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    import os
    import requests
    from requests.exceptions import RequestException
    from pyquery import PyQuery as pq
    import re
    from logger import *
    from DBHelper import *
    from hashlib import md5
    import json
    from bs4 import BeautifulSoup as bs
    
    
    def get_page_detail(url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                response.encoding = 'utf-8'
                content = response.text
                pqContent = pq(content)
                title = pqContent('#artibodyTitle').text()
                print(title)
                date = pqContent('#wrapOuter > div > div.page-info > span').text()[:16]
                print(date)
                cnt = bs(content, "lxml")
                body = cnt.find(id='artibody')
                # Sina pages often embed a promotional blockquote; swap it for our own link
                if body and body.blockquote:
                    # new_tag() takes a bare tag name, so build an <a> element directly
                    new_tag = cnt.new_tag("a", href="www.mysite.com")
                    new_tag.string = "replace with your own site name and address"
                    body.blockquote.replace_with(new_tag)
                #print(str(body))
    
                articleContent = pq(''.join(str(body)))
                #print(type(articleContent))
                #print(articleContent)
                if body:
                    # extract the article text between Sina's original-content markers
                    regex = re.compile('<!-- 原始正文start -->(.*)<!-- 原始正文end -->', re.S)
                    match = re.findall(regex, str(body))
                    if match:
                        match = match[0].strip()
                        images = pq(match)('img')
                        for img in images:
                            img_name = get_page_img(pq(img).attr('src'))
                            if img_name:
                                r = re.subn(re.escape(pq(img).attr('src')), 'img/' + img_name, match)
                                match = r[0]
                        content_url = write_to_file(match, url)
                        article = {'title': title, 'content': content_url, 'date': date, 'expired': 'false'}
                        insert(article)
                    else:
                        logger.info('Could not extract the article body: [%s]' % url)

                else:
                    logger.info('No artibody element found on the page: [%s]' % url)
    
        except RequestException:
            logger.info('Error requesting the article page: %s', url)
    
    # download an image referenced in the article body
    def get_page_img(url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                actual_img_path = save_image(response.content)
                return actual_img_path
            else:
                return None
        except RequestException:
            logger.info('Error requesting image: %s', url)
            return None
    
    # save every illustration found in the article body
    def save_image(content):
        img_folder = os.path.join(os.getcwd(), 'img')
        os.makedirs(img_folder, exist_ok=True)
        img_name = md5(content).hexdigest()
        img_path = '{0}/{1}.{2}'.format(img_folder, img_name, 'jpg')
        if not os.path.exists(img_path):
            with open(img_path, 'wb') as f:
                f.write(content)
        # return the same file name whether or not the image already existed
        return img_name + '.jpg'
    # save the article body to a file
    def write_to_file(content, url):
        content_folder = os.path.join(os.getcwd(), 'files')
        os.makedirs(content_folder, exist_ok=True)
        file_name = md5(url.encode('utf-8')).hexdigest()
        file_path = '{0}/{1}.{2}'.format(content_folder, file_name, 'txt')
        if not os.path.exists(file_path):
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(json.dumps(content, ensure_ascii=False))
            logger.info('Article body saved; Sina source url: ' + url)
        return file_name
    The logging module, logger.py:
    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    
    import os
    import logging
    import time
    
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    
    # one log file per day
    logFile = './log/log_{0}.txt'.format(time.strftime("%Y%m%d", time.localtime()))

    # make sure the log directory exists before attaching the file handler
    os.makedirs('./log', exist_ok=True)

    fh = logging.FileHandler(logFile, mode='a')
    fh.setLevel(logging.INFO)

    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)

    formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s:%(message)s")
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)

    logger.addHandler(fh)
    logger.addHandler(ch)

    The remaining modules: Models.py, DBHelper.py and config.py.

    DBHelper.py:
    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    
    import pymysql
    
    from config import *
    from logger import *
    
    #
    # extended functionality
    #

    def insert(article):
        db = pymysql.connect(host=HOST, port=PORT, user=USERNAME, passwd=PASSWORD,
                             db=DATABASE, charset='utf8', use_unicode=True)
        cursor = db.cursor()
        # parameterised query, so quotes in scraped titles cannot break the statement
        sql = "insert into articles(title,content,date,expired) values(%s,%s,%s,%s)"
        params = (article['title'], article['content'], article['date'], article['expired'])
        try:
            cursor.execute(sql, params)
            db.commit()
            logger.info('Article record inserted: %s', article['title'])
        except Exception:
            logger.error('Failed to insert article record: %s', article['title'])
            db.rollback()
        db.close()
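
    The insert above assumes an articles table already exists in the target database. The one-off helper below is a sketch of how it could be created with pymysql; the column names follow the insert statement, but the column types and lengths are assumptions, since the original schema is not shown:

    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    # One-off helper that creates the articles table used by DBHelper.insert().
    # Column types and lengths are assumptions; adjust them to your own needs.

    import pymysql

    from config import *

    CREATE_SQL = """
    CREATE TABLE IF NOT EXISTS articles (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255) NOT NULL,
        content VARCHAR(64) NOT NULL,      -- file name returned by write_to_file
        `date` VARCHAR(32),
        expired VARCHAR(8) DEFAULT 'false'
    ) DEFAULT CHARSET=utf8
    """

    def create_table():
        db = pymysql.connect(host=HOST, port=PORT, user=USERNAME, passwd=PASSWORD,
                             db=DATABASE, charset='utf8')
        try:
            with db.cursor() as cursor:
                cursor.execute(CREATE_SQL)
            db.commit()
        finally:
            db.close()

    if __name__ == '__main__':
        create_table()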
    The database configuration, config.py:
    # database config items
    HOST = '127.0.0.1'
    PORT = 6000            # database port; I changed mine to 6000
    DATABASE = 'your_database_name'
    USERNAME = 'your_database_user'
    PASSWORD = 'your_database_password'
    Models.py (not used in this example; it can be extended so that articles are handled through a class):
    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    
    class Article:
        ID = None
        Title = ''
        SubTitle = ''
        Summary = ''
        Content = ''
        Date = ''
        Author = ''
        ForumID = 0
        StickyPost = 'false'
        Expired = 'false'

    2. Configuring nginx

    The request entry point, app.py (served behind nginx + uWSGI):

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    from sinaForex import *
    import time
    from logger import *
    
    def application(env, start_response):
        start_response('200 OK', [('Content-Type', 'text/html')])
        s = time.time()
        count = get_index_pages()
        e = time.time()
        logger.info("----------------- crawled {0} news items in {1}s -----------------".format(count, round(e - s, 3)))
        rst = "Crawled {0} news items in {1}s".format(count, round(e - s, 3))
        print(time.localtime(time.time()))
        return [rst.encode('utf-8')]
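
    Before putting nginx and uWSGI in front of it, the WSGI callable can be smoke-tested locally with the standard library's wsgiref server. A minimal sketch, assuming it is run from the directory containing app.py; hitting any path triggers one crawl:

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    # Local smoke test for the WSGI callable in app.py, without nginx/uWSGI.

    from wsgiref.simple_server import make_server

    from app import application

    if __name__ == '__main__':
        # serve on a throwaway port so it does not clash with the uWSGI socket
        with make_server('127.0.0.1', 8080, application) as server:
            print('Serving on http://127.0.0.1:8080 ...')
            server.serve_forever()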
    The nginx configuration, e.g. in the default vhost layout: ../nginx/conf/vhost/py.mysite.com.conf

    server {
        listen 80;
        root /www/web/sina_forex;
        server_name py.mysite.com;
        index index.html index.php index.htm;
        error_page 400 /errpage/400.html;
        error_page 403 /errpage/403.html;
        error_page 404 /errpage/404.html;
        error_page 503 /errpage/503.html;

        location /spider {
            uwsgi_pass 127.0.0.1:8001;
            include uwsgi_params;
        }
        location / {
            try_files $uri @apache;
        }
    }

    The uWSGI configuration (/etc/uwsgi8001.ini):
    [uwsgi]
    # web service port
    socket = :8001
    # site root directory
    chdir = /www/web/sina_forex
    # module containing the WSGI application
    wsgi-file = app.py
    vhost = true
    master = true
    processes = 5
    pidfile = /www/web/sina_forex/uwsgi8001.pid
    daemonize = /www/web/sina_forex/log/uwsgi8001.log

    With this in place the crawl can be triggered over the web at http://py.mysite.com/spider, per the nginx location rule above.

    3. Collecting news on a schedule

    Write a small script, autoSpiderTimer.py, that issues the web request on a schedule; the process itself is managed by supervisor (a possible supervisor entry is sketched after the listing below).

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import requests
    import time
    
    def timer():
        while True:
            response = requests.get('http://py.mysite.com/spider')
            print(response.text)
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            time.sleep(3600 * 24)

    timer()   # trigger one crawl per day
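
    A possible supervisor program entry for keeping autoSpiderTimer.py alive; the file location, program name and log paths below are assumptions that follow the directories used earlier:

    ; /etc/supervisor/conf.d/autospider.conf (location is an assumption)
    [program:autospider]
    command=python3 /www/web/sina_forex/autoSpiderTimer.py
    directory=/www/web/sina_forex
    autostart=true
    autorestart=true
    stdout_logfile=/www/web/sina_forex/log/autospider.log
    stderr_logfile=/www/web/sina_forex/log/autospider_err.log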