• 一种通用数据采集的schema定义形式


    {
      "name": "凤凰金融",
      "notice": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ],
        "comments": "网站通告"
      },
      "url": {
        "data": "attribute",
        "value": "http://www.fengjr.com/financing/list?type=cx",
        "comments": "本平台数据的采集URL"
      },
      "project": {
        "data": "url",
        "url": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "template": ""
        },
        "title": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ]
        },
        "detail": {
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "member": {
        "data": "sub_item",
        "sub_item": {
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "src-save": 0,
          "url": {
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ],
            "template": ""
          }
        },
        "detail": {
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "src-save": 1
    }

    补充(完整示例:在各 detail 中增加 name 字段,并新增 crawler 抓取配置):

    {
      "name": "凤凰金融",
      "notice": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      },
      "url": {
        "data": "attribute",
        "value": "http://www.fengjr.com/financing/list?type=cx"
      },
      "project": {
        "data": "url",
        "url": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "template": ""
        },
        "title": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ]
        },
        "detail": {
          "name": "网贷列表",
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "member": {
        "data": "sub_item",
        "sub_item": {
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "src-save": 0,
          "url": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ],
            "template": ""
          }
        },
        "detail": {
          "name": "会员材料",
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "src-save": 1,
      "crawler": {
        "handler": "httpClient|selenium",
        "results": "html|json|text",
        "next_page": {
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "template": ""
        },
        "history": "re-crawl|skip|stop"
      }
    }
  • 相关阅读:
    网页定位导航特效
    学习笔记(一) HTML+CSS基础课程
    《javascript dom编程艺术》笔记(二)——美术馆示例
    《javascript dom编程艺术》笔记(一)——优雅降级、向后兼容、多个函数绑定onload函数
    javascript之事件处理
    将Emmet安装到到 Sublime text 3?
    如何将Emmet安装到到 Sublime text 3?
    Web前端研发工程师编程能力飞升之路
    childNodes 节点数量问题说明
    gerrit + ldap + phpldapadmin docker部署
  • 原文地址:https://www.cnblogs.com/feika/p/4281864.html
Copyright © 2020-2023  润新知