• pyspider示例代码六:传递参数


    传递参数

    示例一

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
    # Created on 2014-10-25 14:31:24
    
    import re
    import json
    from libs.pprint import pprint
    from libs.base_handler import *
    
    class Handler(BaseHandler):
        """Sample handler: crawl a Douban group and forward image posts to Duoshuo.

        Flow: ``on_start`` fetches the group's discussion list, ``index_page``
        follows every topic link, ``detail_page`` extracts the topic fields, and
        ``on_result`` posts each result that contains images to the Duoshuo
        import API.
        """
        crawl_config = {
        }
        # Prefix prepended to every Douban URL that is crawled; empty by default.
        proxy = ""

        @every(0, 30)
        def on_start(self):
            """Queue the group's discussion list, forcing a re-crawl each run."""
            self.crawl(self.proxy+'http://www.douban.com/group/haixiuzu/discussion',
                       force_update=True, callback=self.index_page)

        @config(age=10)
        def index_page(self, response):
            """Queue a detail-page crawl for every topic link in the list."""
            for each in response.doc('tr > .title > a').items():
                self.crawl(self.proxy+each.attr.href, callback=self.detail_page)

        @config(age=30*24*60*60)
        def detail_page(self, response):
            """Extract URL, title, author, author URL and image URLs of a topic."""
            # Landing on the site home page instead of a topic is treated as a
            # failure of this task rather than as a result.
            assert response.url != "https://www.douban.com/"
            return {
                "url": response.url,
                "title": response.doc("#content h1").text(),
                "author": response.doc(".topic-content .from a").text(),
                "author_url": response.doc("DIV.topic-doc>H3>SPAN.from>A").attr.href,
                "imgs": [x.attr.src for x in response.doc('.topic-doc img').items()]
            }

        def on_result(self, result):
            """Post a result that contains images to the Duoshuo import API.

            Results that are empty, have no images, or come from a URL without
            a numeric topic id are skipped.
            """
            if not result or not result['imgs']:
                return
            # The digit class needs its backslash: a bare "d+" only matches
            # literal "d" characters and never a numeric topic id.
            matched = re.search(r"topic/(\d+)", self.response.url)
            if matched is None:
                # No topic id in the URL means there is nothing to key the post on.
                return
            post_id = matched.group(1)
            # The "base64" str codec exists only on Python 2; the base64 module
            # yields the same newline-free text on Python 2 and 3.
            import base64
            message = base64.b64encode(json.dumps(result).encode("utf-8")).decode("ascii")
            # NOTE(review): the API secret is hard-coded below; consider loading
            # it from configuration instead of keeping it in source.
            self.crawl("https://api.duoshuo.com/posts/import.json#"+post_id, method="POST",
                data={
                "short_name": "database",
                "secret": "8e5a5be8873ad7e9a59147c3cfd10e73",
                "posts[0][post_key]": post_id,
                "posts[0][thread_key]": "haixiuzu",
                "posts[0][message]": message
            }, callback=self.post_to_duoshuo)


        def post_to_duoshuo(self):
            """Callback for the Duoshuo POST; the response is deliberately ignored."""
            pass

    示例二

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2016-08-30 19:11:28
    # Project: prieto
    
    import re
    from pyspider.libs.base_handler import *
    
    
    class Handler(BaseHandler):
        """Crawl printable thread pages of bbs.fobshanghai.com by thread id.

        ``on_start`` fans out 10000 ``data:`` tasks, each carrying its step
        number through ``save``; ``gen_url`` turns every step into 700
        consecutive thread ids, and ``index_page`` returns the page HTML.
        """

        crawl_config = {
        }

        @every(minutes=24 * 60)
        def on_start(self):
            """Queue one ``data:`` task per step, passing the step number via ``save``."""
            for i in range(10000):
                self.crawl('data:,step%d' % i, callback=self.gen_url, save=i)

        @config(priority=0)
        def gen_url(self, respond):
            """Queue the 700 thread pages that belong to the step in ``respond.save``."""
            first_tid = respond.save * 700
            for tid in range(first_tid, first_tid + 700):
                self.crawl("http://bbs.fobshanghai.com/viewthread.php?action=printable&tid=%d" % tid, callback=self.index_page)

        @config(priority=1)
        def index_page(self, respond):
            """Return the thread id, URL and HTML of a printable thread page.

            A page whose ``<head>`` HTML starts with ``<meta`` is reported as a
            missing thread, with a fixed message in place of the HTML.
            """
            # Separators between posts on the printable page. They are only
            # needed by the optional per-post parsing sketched in the comments.
            hr_black = u'<hr noshade="noshade" size="2" width="100%" color="#808080"/>'
            hr_blue = u'<br/><br/><br/><br/><hr noshade="noshade" size="2" width="100%" color="#698cc3"/>'

            #posts = respond.doc('body').html().split(hr_blue)[0].split(hr_black)[1:]

            # The thread id is whatever follows the last '=' in the URL.
            tid = respond.url.split('=')[-1]

            # Fall back to '' so a page without a <head> does not raise here.
            head_html = respond.doc('head').html() or ''
            if head_html.startswith('<meta'):
                return {
                   "tid": tid,
                   "url": respond.url,
                   "html": 'The specified thread does not exist.',
                }

            return {
                "tid": tid,
                "url": respond.url,
                #"t_author": posts[0].split('\n')[1].split('<b>')[0].strip(),  # a regex would be better
                "html": respond.doc.html(),
                #"replies": [i for i in posts[1:]]
            }
  • 相关阅读:
    eclipse自动切换到debug视图
    Android Studio 1.1.0 “关联源码” 或者“导入源码” ,又或者插件包
    Eclipse中如何安装和使用GrepCode插件 (转)
    转【Python】同时向控制台和文件输出日志logging
    AngularJs学习
    MongoDB聚合运算之mapReduce函数的使用(11)
    MongoDB聚合运算之group和aggregate聚集框架简单聚合(10)
    MongoDB的分片(9)
    MongoDB replication set副本集(主从复制)(8)(转)
    MongoDB的导入导出(7)
  • 原文地址:https://www.cnblogs.com/microman/p/6140809.html
Copyright © 2020-2023  润新知