• pyspider解析


    https://www.cnblogs.com/microman/p/6111711.html

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2017-12-07 13:40:43
    # Project: adquan
    
    from pyspider.libs.base_handler import *
    
    
    class Handler(BaseHandler):
        """pyspider handler that fetches an article page and downloads its images.

        Images are looked up with the ``.con_Text img`` selector and written to
        disk through a ``Deal`` helper.
        """

        crawl_config = {}

        def __init__(self):
            # All directory creation and file writing is delegated to Deal.
            self.deal = Deal()

        @every(minutes=24 * 60)
        def on_start(self):
            """Queue the single seed article; scheduled every 24 * 60 minutes."""
            self.crawl('http://creative.adquan.com/show/42759', callback=self.detail_page)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            """Queue every link whose href starts with "http" as a detail page.

            Not referenced by ``on_start``, which goes straight to
            ``detail_page``.
            """
            for anchor in response.doc('a[href^="http"]').items():
                self.crawl(anchor.attr.href, callback=self.detail_page)

        @config(priority=2)
        def detail_page(self, response):
            """Queue a download for each article image and return page metadata.

            Every image with a non-empty ``src`` is numbered in encounter order
            (0, 1, 2, ...) and handed to ``save_img`` together with its target
            directory and file name via the task's ``save`` payload.

            Returns a dict with the page ``url`` and ``title``.
            """
            album = 'test'
            index = 0
            for image in response.doc('.con_Text img').items():
                src = image.attr.src
                if not src:
                    # Images without a source are skipped and do not consume
                    # a sequence number.
                    continue
                target_dir = self.deal.mkDir(album)
                suffix = self.deal.getExtension(src)
                target_name = str(index) + '.' + suffix
                index += 1
                self.crawl(
                    src,
                    callback=self.save_img,
                    save={'dir_path': target_dir, 'file_name': target_name},
                )
            page_info = {
                "url": response.url,
                "title": response.doc('title').text(),
            }
            return page_info

        def save_img(self, response):
            """Write the fetched image bytes to the path carried in ``response.save``."""
            payload = response.content
            destination = response.save['dir_path'] + '/' + response.save['file_name']
            self.deal.saveImg(payload, destination)
    
        
    import os
    
    # Root directory under which Deal stores everything it saves. Deal.__init__
    # appends a trailing '/' if missing and creates the directory when absent.
    DIR_PATH = "E:/pyspider/"
    
    class Deal:
        """Filesystem helper that stores downloaded content under ``DIR_PATH``."""

        def __init__(self):
            """Normalise the root path to end with '/' and make sure it exists."""
            self.path = DIR_PATH
            if not self.path.endswith('/'):
                self.path = self.path + '/'
            if not os.path.exists(self.path):
                os.makedirs(self.path)

        def mkDir(self, path):
            """Ensure the sub-directory ``path`` exists under the root.

            Leading/trailing whitespace in ``path`` is stripped first.

            Returns the full directory path (root + stripped name), whether it
            was just created or already existed.
            """
            dir_path = self.path + path.strip()
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            return dir_path

        def saveImg(self, content, path):
            """Write the raw bytes ``content`` to ``path``, replacing any existing file."""
            # The context manager closes the handle even if write() raises.
            with open(path, 'wb') as f:
                f.write(content)

        def saveBrief(self, content, dir_path, name):
            """Write ``content`` as UTF-8 to ``<dir_path>/<name>.txt``.

            ``content`` may be text (encoded to UTF-8 here) or already-encoded
            bytes (written unchanged).
            """
            file_name = dir_path + "/" + name + ".txt"
            if isinstance(content, bytes):
                data = content
            else:
                data = content.encode('utf-8')
            # Binary mode: encoded bytes cannot be written to a text-mode file
            # on Python 3, and the handle is now always closed.
            with open(file_name, "wb") as f:
                f.write(data)

        def getExtension(self, url):
            """Return the file extension of ``url`` without the leading dot.

            Any query string or fragment is ignored, and only the last path
            segment is examined, so ``http://h/a/pic.jpg?w=100`` yields
            ``jpg``. Returns an empty string when the last segment contains
            no dot.
            """
            # Strip "#fragment" first, then "?query", so dots inside them
            # cannot be mistaken for the extension separator.
            path = url.split('#', 1)[0].split('?', 1)[0]
            last_segment = path.rsplit('/', 1)[-1]
            if '.' not in last_segment:
                return ''
            return last_segment.rsplit('.', 1)[-1]
    

      http://demo.pyspider.org/

  • 相关阅读:
    11g新特性-dba_users安全性的一些增强
    sysbench的安装与使用(with MySQL)
    参数table_open_cache
    参数max_allowed_packet
    解决linux下unzip中文有乱码的问题
    11g添加asm
    有了iscsi存储怎么让主机识别以及使用创建lvm
    用rlwrap使sqlplus可以上下翻页
    卸载已经安装的rpm包
    物化视图刷新慢--有可能是mv log被多个mv使用造成的
  • 原文地址:https://www.cnblogs.com/jiangjing/p/8001321.html
Copyright © 2020-2023  润新知