• spider_keeper


      

      

    一 简介 

      spider_keeper 是一款开源的spider管理工具,可以方便的进行爬虫的启动,暂停,定时,同时可以查看分布式情况下所有爬虫日志,查看爬虫执行情况等功能。

     二 安装 部署

    安装环境
    ubuntu16.04
    python3.5
    pip3 install scrapy
    pip3 install scrapyd
    pip3 install scrapyd-client
    pip3 install scrapy-redis
    pip3 install SpiderKeeper

      部署:

    # 注意不要覆盖SpiderKeeper.db
    rsync -avz spider_keeper crawler_server1:/data --exclude '*.log' --exclude '*.pyc' --exclude '*.db' --exclude 'env'
    
    pip install -r requirements.txt

      运行:

    使用虚拟环境运行
    virtualenv -p /usr/bin/python3.6 env
    source env/bin/activate
    
    python run.py --port=50002 --username=spider --password=sxkjspider2018 --server=http://10.10.4.2:6800 --server=http://10.10.4.11:6800 --server=http://10.10.4.12:6800

    三 数据库四张表用处

      查看建表信息

    看所有表
    .table
    
    默认情况下,查询结果不会显示表头,需要事先设置,命令为:
    .header on
    
    如果只想查看具体一张表的表结构,比如查看emperors表,命令为:
    
    select * from sqlite_master where type="table" and name="emperors";

      sk_job_execution :  任务表,所有的任务(运行,取消,等待)都在这一张表里

      

    table|sk_job_execution|sk_job_execution|10|CREATE TABLE sk_job_execution (
        id INTEGER NOT NULL,
        date_created DATETIME,
        date_modified DATETIME,
        project_id INTEGER NOT NULL,
        service_job_execution_id VARCHAR(50) NOT NULL,
        job_instance_id INTEGER NOT NULL,
        create_time DATETIME,
        start_time DATETIME,
        end_time DATETIME,
        running_status INTEGER,
        running_on TEXT,
        PRIMARY KEY (id)
    )

      sk_job_instance : 定时任务表,存放所有项目的定时任务

    table|sk_job_instance|sk_job_instance|5|CREATE TABLE sk_job_instance (
        id INTEGER NOT NULL,
        date_created DATETIME,
        date_modified DATETIME,
        spider_name VARCHAR(100) NOT NULL,
        project_id INTEGER NOT NULL,
        tags TEXT,
        spider_arguments TEXT,
        priority INTEGER,
        "desc" TEXT,
        cron_minutes VARCHAR(20),
        cron_hour VARCHAR(20),
        cron_day_of_month VARCHAR(20),
        cron_day_of_week VARCHAR(20),
        cron_month VARCHAR(20),
        enabled INTEGER,
        run_type VARCHAR(20),
        PRIMARY KEY (id)
    )

    sk_project : 项目表 存放所有项目      

    type|name|tbl_name|rootpage|sql
    table|sk_project|sk_project|2|CREATE TABLE sk_project (
        id INTEGER NOT NULL,
        date_created DATETIME,
        date_modified DATETIME,
        project_name VARCHAR(50),
        PRIMARY KEY (id)
    )

       sk_spider : 爬虫表 存放所有爬虫

    table|sk_spider|sk_spider|3|CREATE TABLE sk_spider (
        id INTEGER NOT NULL,
        date_created DATETIME,
        date_modified DATETIME,
        spider_name VARCHAR(100),
        project_id INTEGER NOT NULL,
        PRIMARY KEY (id)
    )

      这四张表都在 spider下的 model.py

    四 接口 api

      在 spider 下的 controller.py 中,

      Api部分

          

    api.add_resource(ProjectCtrl, "/api/projects") # 返回项目列表 项目id 项目名称
    返回列表

    1 '''
    2 [
    3     {
    4         "project_id": 1,
    5         "project_name": "baidutieba_wz"
    6     }
    7 ]
    8 
    9 '''

    api.add_resource(SpiderCtrl, "/api/projects/<project_id>/spiders") # 返回爬虫列表
    返回列表

    1 '''
    2 [
    3     {
    4         "spider_instance_id": 1,
    5         "spider_name": "all_detail",
    6         "project_id": 1
    7     }
    8 ]
    9 '''

    api.add_resource(SpiderDetailCtrl, "/api/projects/<project_id>/spiders/<spider_id>") # 通过项目id爬虫id,返回爬虫字典

    1 '''
    2 {
    3     "spider_instance_id": 1,
    4     "spider_name": "all_detail",
    5     "project_id": 1
    6 }
    7 
    8 '''

    api.add_resource(JobCtrl, "/api/projects/<project_id>/jobs") # 通过项目ID 找所有的定时任务

     1 '''
     2 [
     3     {
     4         "job_instance_id": 2,
     5         "spider_name": "all_detail",
     6         "tags": null,
     7         "spider_arguments": "",
     8         "priority": 0,
     9         "desc": null,
    10         "cron_minutes": "0",
    11         "cron_hour": "*",
    12         "cron_day_of_month": "*",
    13         "cron_day_of_week": "*",
    14         "cron_month": "*",
    15         "enabled": true,
    16         "run_type": "periodic"
    17     }
    18 ]
    19 '''

    api.add_resource(JobDetailCtrl, "/api/projects/<project_id>/jobs/<job_id>") # 通过项目ID和任务ID定位单个定时任务(具体行为需查看 JobDetailCtrl 实现)

    api.add_resource(JobExecutionCtrl, "/api/projects/<project_id>/jobexecs") # 所有状态的任务字典

     1 '''
     2 
     3 {
     4     "PENDING": [],
     5     "RUNNING": [],
     6     "COMPLETED": [
     7         {
     8             "project_id": 1,
     9             "job_execution_id": 2,
    10             "job_instance_id": 2,
    11             "service_job_execution_id": "f91f3ed0341311e9a72c645aedeb0f3b",
    12             "create_time": "2019-02-19 15:00:00",
    13             "start_time": "2019-02-19 15:00:03",
    14             "end_time": "2019-02-19 15:24:53",
    15             "running_status": 3,
    16             "running_on": "http://127.0.0.1:6800",
    17             "job_instance": {
    18                 "job_instance_id": 2,
    19                 "spider_name": "all_detail",
    20                 "tags": null,
    21                 "spider_arguments": "",
    22                 "priority": 0,
    23                 "desc": null,
    24                 "cron_minutes": "0",
    25                 "cron_hour": "*",
    26                 "cron_day_of_month": "*",
    27                 "cron_day_of_week": "*",
    28                 "cron_month": "*",
    29                 "enabled": true,
    30                 "run_type": "periodic"
    31             }
    32         }
    33      
    34     ]
    35 }
    36 '''

      Router部分

      project_create : 创建项目   对应 scrapyd 的 AddVersion

      请求方式: POST

      参数: project_name : 项目名称

      代码如下:

    1 @app.route("/project/create", methods=['post'])
    2 def project_create():
    3     project_name = request.form['project_name']
    4     project = Project()
    5     project.project_name = project_name
    6     db.session.add(project)
    7     db.session.commit()
    8     return redirect("/project/%s/spider/deploy" % project.id, code=302)

      project_delete : 删除项目 对应 scrapyd 的 DeleteProject

      请求方式 : GET

      请求参数 : project_id  项目的 id

      代码如下:

    1 @app.route("/project/<project_id>/delete")
    2 def project_delete(project_id):
    3     project = Project.find_project_by_id(project_id)
    4     agent.delete_project(project)
    5     db.session.delete(project)
    6     db.session.commit()
    7     return redirect("/project/manage", code=302)

      project_manage 项目初始页面 项目修改(删除,添加后会跳转到这个页面)

      请求方式 : GET

      代码如下:

    @app.route("/project/manage")
    def project_manage():
        return render_template("project_manage.html")

      project_index 项目任务展示 展示项目的所有任务(等待,运行,取消,成功)

      请求方式 : GET

      请求参数: 项目 ID project_id

      代码如下:

    1 @app.route("/project/<project_id>")
    2 def project_index(project_id):
    3     session['project_id'] = project_id
    4     return redirect("/project/%s/job/dashboard" % project_id, code=302)

      

      index 初始页面 如果没有项目跳转到 project_manage 如果有项目 跳转到第一个项目的任务页面 project_index

      请求: GET

      代码如下:

    1 @app.route("/")
    2 def index():
    3     project = Project.query.first()
    4     if project:
    5         return redirect("/project/%s/job/dashboard" % project.id, code=302)
    6     return redirect("/project/manage", code=302)

      job_periodic 定时任务展示 展示本项目的所有定时任务 periodic(周期性)

      请求方式: GET

      请求参数: project_id 项目ID

      代码如下:

    1 @app.route("/project/<project_id>/job/periodic")
    2 def job_periodic(project_id):
    3     project = Project.find_project_by_id(project_id)
    4     job_instance_list = [job_instance.to_dict() for job_instance in
    5                          JobInstance.query.filter_by(run_type="periodic", project_id=project_id).all()]
    6     return render_template("job_periodic.html",
    7                            job_instance_list=job_instance_list)

      job_add 添加任务 定时任务 一次性任务 auto 任务

      请求方式: GET

      请求参数 :  项目ID project_id

                爬虫名称 : spider_name

                spider_arguments : 传递给爬虫的参数,多个参数以逗号分隔

            优先级 : priority (默认是 0 )

            运行类型 : run_type

            分钟 : cron_minutes

            小时: cron_hour

            每月某日: cron_day_of_month

            每周某日: cron_day_of_week

            月份 : cron_month

      代码如下:

     1 @app.route("/project/<project_id>/job/add", methods=['post'])
     2 def job_add(project_id):
     3     project = Project.find_project_by_id(project_id)
     4     job_instance = JobInstance()
     5     job_instance.spider_name = request.form['spider_name']
     6     job_instance.project_id = project_id
     7     job_instance.spider_arguments = request.form['spider_arguments']
     8     job_instance.priority = request.form.get('priority', 0)
     9     job_instance.run_type = request.form['run_type']
    10     # chose daemon manually
    11     if request.form['daemon'] != 'auto':
    12         spider_args = []
    13         if request.form['spider_arguments']:
    14             spider_args = request.form['spider_arguments'].split(",")
    15         spider_args.append("daemon={}".format(request.form['daemon']))
    16         job_instance.spider_arguments = ','.join(spider_args)
    17     if job_instance.run_type == JobRunType.ONETIME:
    18         job_instance.enabled = -1
    19         db.session.add(job_instance)
    20         db.session.commit()
    21         agent.start_spider(job_instance)
    22     if job_instance.run_type == JobRunType.PERIODIC:
    23         job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
    24         job_instance.cron_hour = request.form.get('cron_hour') or '*'
    25         job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
    26         job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
    27         job_instance.cron_month = request.form.get('cron_month') or '*'
    28         # set cron exp manually
    29         if request.form.get('cron_exp'):
    30             job_instance.cron_minutes, job_instance.cron_hour, job_instance.cron_day_of_month, job_instance.cron_day_of_week, job_instance.cron_month = \
    31                 request.form['cron_exp'].split(' ')
    32         db.session.add(job_instance)
    33         db.session.commit()
    34     return redirect(request.referrer, code=302)

      job_stop 暂停任务 将运行中的任务停止

      请求方式: GET

      请求参数:

        project_id 项目 ID

        job_exec_id 任务 ID

      代码如下:

    1 @app.route("/project/<project_id>/jobexecs/<job_exec_id>/stop")
    2 def job_stop(project_id, job_exec_id):
    3     job_execution = JobExecution.query.filter_by(project_id=project_id, id=job_exec_id).first()
    4     agent.cancel_spider(job_execution)
    5     return redirect(request.referrer, code=302)

      job_log 显示任务日志

      请求方式 :GET

      请求参数: 

        project_id 项目 ID

        job_exec_id 任务 ID

      代码如下:

     1 @app.route("/project/<project_id>/jobexecs/<job_exec_id>/log")
     2 def job_log(project_id, job_exec_id):
     3     job_execution = JobExecution.query.filter_by(project_id=project_id, id=job_exec_id).first()
     4     bytes = -5000
     5     while True:
     6         res = requests.get(agent.log_url(job_execution),
     7                            headers={'Range': 'bytes={}'.format(bytes)},
     8                            )
     9         if '<span>builtins.OSError</span>: <span>[Errno 22] Invalid argument</span>' not in res.text:
    10             break
    11         else:
    12             bytes = bytes / 10
    13     res.encoding = 'utf8'
    14     raw = res.text
    15     return render_template("job_log.html", log_lines=raw.split('\n'))

      job_run 启动定时任务

      请求方式 :GET

      请求参数 : 

        project_id 项目ID

        job_instance_id 定时任务的ID

      代码如下:

    1 @app.route("/project/<project_id>/job/<job_instance_id>/run")
    2 def job_run(project_id, job_instance_id):
    3     job_instance = JobInstance.query.filter_by(project_id=project_id, id=job_instance_id).first()
    4     agent.start_spider(job_instance)
    5     return redirect(request.referrer, code=302)

      job_remove 移除定时任务

      请求方式:GET

      请求参数 :

        project_id 项目ID

        job_instance_id 定时任务的ID

      代码如下:

    1 @app.route("/project/<project_id>/job/<job_instance_id>/remove")
    2 def job_remove(project_id, job_instance_id):
    3     job_instance = JobInstance.query.filter_by(project_id=project_id, id=job_instance_id).first()
    4     db.session.delete(job_instance)
    5     db.session.commit()
    6     return redirect(request.referrer, code=302)

     

       job_switch 定时任务状态切换

      请求方式 : GET

      请求参数:

        project_id 项目ID

        job_instance_id 定时任务的ID

      代码如下:

     

    1 @app.route("/project/<project_id>/job/<job_instance_id>/switch")
    2 def job_switch(project_id, job_instance_id):
    3     job_instance = JobInstance.query.filter_by(project_id=project_id, id=job_instance_id).first()
    4     job_instance.enabled = -1 if job_instance.enabled == 0 else 0
    5     db.session.commit()
    6     return redirect(request.referrer, code=302)

      spider_dashboard 爬虫展示

       请求方式 : GET

        请求参数:

          project_id 项目ID

        代码如下:

        

    1 @app.route("/project/<project_id>/spider/dashboard")
    2 def spider_dashboard(project_id):
    3     spider_instance_list = SpiderInstance.list_spiders(project_id)
    4     return render_template("spider_dashboard.html",
    5                            spider_instance_list=spider_instance_list)

      spider_deploy 项目配置 egg文件上传页面

      请求方式 : GET

      请求参数: 

        project_id 项目ID

      代码如下:

    1 @app.route("/project/<project_id>/spider/deploy")
    2 def spider_deploy(project_id):
    3     project = Project.find_project_by_id(project_id)
    4     return render_template("spider_deploy.html")

      spider_egg_upload egg 文件上传

      请求方式 : POST

      请求参数 :

        project_id : 项目 ID

        file : egg 文件

      代码如下:

     1 @app.route("/project/<project_id>/spider/upload", methods=['post'])
     2 def spider_egg_upload(project_id):
     3     project = Project.find_project_by_id(project_id)
     4     if 'file' not in request.files:
     5         flash('No file part')
     6         return redirect(request.referrer)
     7     file = request.files['file']
     8     # if user does not select file, browser also
     9     # submit a empty part without filename
    10     if file.filename == '':
    11         flash('No selected file')
    12         return redirect(request.referrer)
    13     if file:
    14         filename = secure_filename(file.filename)
    15         dst = os.path.join(tempfile.gettempdir(), filename)
    16         file.save(dst)
    17         agent.deploy(project, dst)
    18         flash('deploy success!')
    19     return redirect(request.referrer)

      project_stats 项目运行统计(每个项目起始时间) 返回一个图形界面

      请求方式 : GET

      请求参数 :

        project_id : 项目 ID

      代码如下:

    1 @app.route("/project/<project_id>/project/stats")
    2 def project_stats(project_id):
    3     project = Project.find_project_by_id(project_id)
    4     run_stats = JobExecution.list_run_stats_by_hours(project_id)
    5     return render_template("project_stats.html", run_stats=run_stats)

      

      service_stats 服务器统计信息

      请求方式 :GET

      请求参数:

        project_id : 项目 ID

      代码如下 :

    1 @app.route("/project/<project_id>/server/stats")
    2 def service_stats(project_id):
    3     project = Project.find_project_by_id(project_id)
    4     run_stats = JobExecution.list_run_stats_by_hours(project_id)
    5     return render_template("server_stats.html", run_stats=run_stats)

        

      

            

  • 相关阅读:
    SVM
    决策树
    神经网络
    机器学习之降维方法
    机器学习之特征选择
    浏览器状态码大全
    哈希表
    社区发现算法总结(二)
    社区发现算法总结(一)
    聚类篇-------度量
  • 原文地址:https://www.cnblogs.com/wzbk/p/10334639.html
Copyright © 2020-2023  润新知