• pyspider示例代码五:实现自动翻页功能


    实现自动翻页功能

    示例代码一

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2016-05-19 00:21:31
    # Project: v2ex
    
    from pyspider.libs.base_handler import *
    #import re
    
    
    class Handler(BaseHandler):
        """Crawl v2ex.com: home page -> tab pages -> node boards -> topics.

        Board listings are followed page by page (automatic pagination) and
        every topic page is turned into a result dict by ``detail_page``.
        All requests are issued with ``validate_cert=False``.
        """

        crawl_config = {
        }

        @every(minutes=24 * 60)
        def on_start(self):
            """Seed the crawl with the v2ex home page (re-run every 24 * 60 minutes)."""
            self.crawl('https://www.v2ex.com/', callback=self.index_page, validate_cert=False)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            """Schedule every ``?tab=`` link found on the home page."""
            for each in response.doc('a[href^="https://www.v2ex.com/?tab="]').items():
                self.crawl(each.attr.href, callback=self.tab_page, validate_cert=False)

        @config(age=10 * 24 * 60 * 60)
        def tab_page(self, response):
            """Schedule every ``/go/`` (board) link found on a tab page."""
            for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items():
                self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

        @config(priority=2)
        def board_page(self, response):
            """Schedule the topics listed on a board page, then its pager links."""
            for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items():
                url = each.attr.href
                # Cut off the "#reply..." fragment so the bare topic URL is
                # crawled (presumably to avoid scheduling one topic under
                # several URLs -- confirm).
                if '#reply' in url:
                    url = url.partition('#')[0]
                self.crawl(url, callback=self.detail_page, validate_cert=False)
            # Automatic pagination: every pager link is fed back into this
            # same callback, so the whole board listing gets walked.
            for each in response.doc('a.page_normal').items():
                self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

        @config(priority=2)
        def detail_page(self, response):
            """Extract the title, body HTML and replies of a topic page.

            Returns:
                A dict with the keys ``url``, ``title``, ``content`` (inner
                HTML of the topic body, ``''`` when the page has none) and
                ``reply_content`` (a list of ``(author, reply_text)`` tuples).
            """
            title = response.doc('h1').text()
            # .html() returns None when the selector matches nothing; fall
            # back to '' instead of raising AttributeError. The former
            # .replace('"', '\"') was a no-op ('\"' is just '"') and has been
            # dropped -- if the result is stored in a database, rely on
            # parameterized queries rather than manual quote escaping.
            content = response.doc('div.topic_content').html() or ''
            # Use the absolute prefix like every other selector in this class:
            # response.doc makes links absolute, so a relative "/member/"
            # prefix would never match.
            authors = response.doc('a[href^="https://www.v2ex.com/member/"]').items()
            replies = response.doc('div.reply_content').items()
            # NOTE(review): authors and replies are paired purely by position;
            # member links that appear outside the reply list would shift the
            # pairing -- confirm against the actual page markup.
            reply_content = [(author.text(), reply.text())
                             for author, reply in zip(authors, replies)]
            # Persistence hook, e.g. self.add_question(title, content) to
            # insert the topic into a database.
            return {
                "url": response.url,
                "title": title,
                "content": content,
                "reply_content": reply_content,
            }

    示例代码二

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2015-01-04 10:42:01
    # Project: tutorial_douban_movie
    
    import re
    from pyspider.libs.base_handler import *
    
    
    class Handler(BaseHandler):
        """
        Sample script for the tutorial
        "pyspider crawler tutorial (1): HTML and CSS selectors":
        http://blog.binux.me/2015/01/pyspider-tutorial-level-1-html-and-css-selector/

        Flow: tag index -> per-tag movie lists (with pagination) -> movie pages.
        """

        @every(minutes=24 * 60)
        def on_start(self):
            """Start the crawl at the Douban movie tag index."""
            start_url = 'http://movie.douban.com/tag/'
            self.crawl(start_url, callback=self.index_page)

        @config(age=24 * 60 * 60)
        def index_page(self, response):
            """Schedule every ``http`` link whose URL contains ``tag`` as a list page."""
            for anchor in response.doc('a[href^="http"]').items():
                href = anchor.attr.href
                if 'tag' not in href:
                    continue
                self.crawl(href, callback=self.list_page)

        @config(age=10*24*60*60, priority=2)
        def list_page(self, response):
            """Schedule the movies on a list page, then the paginator links."""
            movie_selector = 'HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-16-8.clearfix>DIV.article>DIV>TABLE TR.item>TD>DIV.pl2>A'
            pager_selector = 'HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-16-8.clearfix>DIV.article>DIV.paginator>A'

            for movie_link in response.doc(movie_selector).items():
                self.crawl(movie_link.attr.href, priority=9, callback=self.detail_page)

            # Pagination: paginator links are handled by this same callback.
            for pager_link in response.doc(pager_selector).items():
                self.crawl(pager_link.attr.href, callback=self.list_page)

        @config(priority=3)
        def detail_page(self, response):
            """Return the URL, title, rating and director names of a movie page."""
            page_url = response.url
            title = response.doc('HTML>BODY>DIV#wrapper>DIV#content>H1>SPAN').text()
            rating = response.doc('#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > strong').text()

            directors = []
            for node in response.doc('a[rel="v:directedBy"]').items():
                directors.append(node.text())

            result = {
                "url": page_url,
                "title": title,
                "rating": rating,
                "导演": directors,
            }
            return result
  • 相关阅读:
    javascript教程系列-10.DOM(下)
    javascript教程系列-9.DOM(上)
    javascript教程系列-8.BOM
    javascript教程系列-7.Date对象
    javascript教程系列-6.String类型
    javascript教程系列-5.数组
    Python字符串、元组、列表、字典互相转换的方法
    python 中函数
    python中的字符串 列表 字典
    python 基础 知识
  • 原文地址:https://www.cnblogs.com/microman/p/6140803.html
Copyright © 2020-2023  润新知