• pyspider使用


    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2018-11-08 22:33:55
    # Project: qsbk
    
    from pyspider.libs.base_handler import *
    from lxml import html
    from urlparse import urljoin
    import datetime
    class Handler(BaseHandler):
        """pyspider handler that scrapes the entries listed on the qiushibaike front page.

        The crawl starts at ``start_url``; ``index_page`` parses the listing and
        returns one dict per entry with the keys ``name``, ``info``,
        ``crawldate`` and ``url``.
        """
        crawl_config = {
        }

        def __init__(self):
            # Root URL of the site; also the base for resolving relative links.
            self.start_url = 'https://www.qiushibaike.com/'

        @every(minutes=24 * 60)
        def on_start(self):
            """Entry point: queue the front page and parse it with index_page."""
            self.crawl(self.start_url, callback=self.index_page)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            """Extract every entry found in the ``content-left`` container.

            :param response: fetched page; its ``content`` is decoded as UTF-8.
            :return: ``{'data': [item, ...]}`` where each item is a dict with
                ``name`` (author), ``info`` (entry text), ``crawldate``
                (local time, ``%Y-%m-%d %H:%M:%S``) and ``url`` (absolute link
                to the entry). The list is empty when the container is absent.
            """
            root = html.fromstring(response.content.decode('utf-8'))
            containers = root.xpath("//div[@id='content-left']")
            if not containers:
                # Layout changed or a non-listing page was served: nothing to
                # extract, but do not crash the task with an IndexError.
                return {'data': []}

            # One timestamp for the whole page; it does not vary per entry.
            crawldate = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            tasks = []
            for div_node in containers[0].xpath("./div"):
                # 'contentHerf' is the selector used by the original code;
                # keep it verbatim rather than "correcting" the spelling.
                hrefs = div_node.xpath("./a[@class='contentHerf']/@href")
                content_nodes = div_node.xpath(".//div[@class='content']/span[1]")
                if not hrefs or not content_nodes:
                    # Child div without a link or a text body (not a regular
                    # entry): skip it instead of failing the whole page.
                    continue

                names = div_node.xpath(
                    ".//div[@class='author clearfix']/a[contains(@onclick,'web-list-author-text')]/h2/text()")
                # An entry without the author link still yields an item, with
                # an empty name.
                name = names[0] if names else ''
                # string(.) flattens the span and its descendants to plain text.
                info = content_nodes[0].xpath('string(.)')

                tasks.append({
                    'name': name.strip(),
                    'info': info.strip(),
                    'crawldate': crawldate,
                    'url': urljoin(self.start_url, hrefs[0]),
                })
            return {'data': tasks}
             
    

      

  • 相关阅读:
    13. Spring—AOP—JDK 的动态代理
    12. Spring — AOP 面向切面编程
    28-1 父组件传递数据给子组件 — props基本用法—驼峰命名说明
    【洛谷 1596】湖计数
    【洛谷 1280】尼克的任务
    【洛谷 3884】二叉树问题
    【洛谷 3384】模板树链剖分
    【洛谷 2089】烤鸡
    【洛谷 1706】全排列问题
    【洛谷 2692】覆盖
  • 原文地址:https://www.cnblogs.com/c-x-a/p/9932720.html
Copyright © 2020-2023  润新知