• 利用chrome插件批量读取浏览器页面内容并写入数据库


      试想一下,如果每天要收集100页甚至更多的网页数据,采用人工收集会让人吐血,用程序去收集也就成了不二的选择。首先肯定会想到用java、php、C#等高级语言,但这类页面偏偏又需要登录和输入验证码,让人无所适从。还在为收集web端的数据感到苦恼吗?很高兴,你找对地方了。

      应用场景:

        1、需要每天大量重复收集web端的数据

        2、web页面数据需要登录后才能采集

        3、web页面存在翻页

      解决方案:

        手工登录,然后采用chrome插件的方式进行收集。当然你会说用selenium等自动化测试的方法进行收集更cool,而且可以每天自动收集,完全的自动化不用人工参与。但是作为chrome的忠实脑残粉,再者只需要前端的js、服务器端的接收文件、数据库就可以完美解决这一问题,再加上部署和操作简单。脑残粉总有很多理由的嘛。好吧,就算是一种憋屈的曲线救国的实现方式吧。

      思路:

        

    帮助手册:http://open.chrome.360.cn/extension_dev/overview.html

    实例:

    抓取某电商后台订单数据

    1、创建一个项目文件夹并引入所需文件:如D:\tool\chrome_server_plugin

      jquery-2.1.1.min.js、icon.png

    2、创建background.html

    <!-- Empty placeholder page: it loads no scripts and renders no content. -->
    <html><head>
    </head></html>

    3、创建配置文件manifest.json文件

    {
      "name": "获取某电商后台订单信息",
      "version": "1.0",
      "manifest_version": 2,
      "description": "*********获取某电商后台订单信息*********",
      "browser_action": {
        "default_icon": "icon.png"
      },
      "permissions": [
        "webNavigation",
        "tabs",
        "contextMenus",
        "http://服务器接受数据url/"
      ],
      "background": {
        "scripts": ["eventPage.js", "jquery-2.1.1.min.js"]
      },
      "content_scripts": [
        {
          "matches": ["http://抓取页面url/*"],
          "js": ["jquery-2.1.1.min.js", "contentscript.js"]
        }
      ]
    }

    4、创建前端js文件contentscript.js

    // Page counters shared with getOrderInfo(), which refreshes both on every run.
    var totalPage;
    var page = 0;

    // Listen for messages from the background page and start scraping on request.
    chrome.extension.onMessage.addListener(
      function(request, sender, sendResponse) {
        // The background page acknowledges a successful upload with {"cmd": "end"}.
        // That message is not a request to scrape, so it is ignored here; otherwise
        // the same page would be scraped and uploaded again.
        if (request && request.cmd === "end") {
          return;
        }
        totalPage = $("input[name=totalPage]").val();
        console.log("totalPage----------" + totalPage);
        getOrderInfo( sendResponse );
      });
    
    /**
     * Scrapes every order shown on the current order-list page, hands the rows to
     * sendMsg() and clicks through to the next page while one is left.
     *
     * Each order is serialised as eleven fields joined by "@_@", in this order:
     * shop id, order id, review state, payment amount, order time, payment type,
     * shipping fee, order status, buyer account, remark, express number.
     *
     * Side effects: updates the script-level `page` and `totalPage` counters.
     *
     * @param {Function} sendResponse - Reply callback of the triggering message; currently unused.
     */
    function getOrderInfo( sendResponse ){
      // Matches a line break together with the run of spaces that follows it.
      // NOTE(review): in the published listing this regex was split across two
      // lines (a syntax error); it is reconstructed here as CR/LF followed by
      // spaces - confirm against the original source.
      var lineBreakIndent = /[\r\n] +/g;
      var splitstr = "@_@";

      var payMoney = [];  // payment amount, one entry per order
      var orderTime = []; // order time, one entry per order
      $("tr[class=head] span").each(function(){
        var spantxt = $(this).text();
        // Both labels are five characters long; the value is whatever follows.
        if(spantxt.indexOf('货款金额:') > -1){
          payMoney.push(spantxt.substr(5));
        }else if(spantxt.indexOf('下单时间:') > -1){
          orderTime.push(spantxt.substr(5));
        }
      });

      var paytype = []; // payment type: cash on delivery or online payment
      var yunfei = [];  // shipping fee
      $("td[class=p-values]").each(function(){
        var tdtxt = $(this).text();
        paytype.push(tdtxt.indexOf('货到付款') > -1 ? '货到付款' : '在线支付');

        // Everything after the three-character fee label is the fee; 0 when absent.
        var feeIndex = tdtxt.indexOf('运费:');
        yunfei.push(feeIndex > -1 ? tdtxt.substr(feeIndex + 3) : 0);
      });

      var orderStatus = []; // order status
      var users = [];       // buyer account
      var remark = [];      // remark
      $("tr[class=content] td[class=t-c]").each(function(index){
        var tdtxt = $(this).text().replace(lineBreakIndent, "");
        // Cells are consumed in groups of five; only positions 1-3 are kept.
        switch(index % 5){
          case 1:
            orderStatus.push(tdtxt);
            break;
          case 2:
            users.push(tdtxt);
            break;
          case 3:
            remark.push(tdtxt);
            break;
        }
      });

      var express = []; // express (tracking) number
      $("tr[class=content] td div[style='text-align: center;']").each(function(){
        express.push($(this).text().replace(lineBreakIndent, ""));
      });

      var orderInfo = [];
      $("tr[class=head] a[track=orderinfopagebeta]").each(function(index){
        var orderid = $(this).text();

        // The inline style of the comment link encodes the review state. A missing
        // attribute is treated as an empty style instead of throwing.
        var mycomment = ($("a[id=comment_" + orderid + "]").attr('style') || '').replace(lineBreakIndent, "");
        if("display: block;" === mycomment){
          mycomment = '已评价';
        }else if('display:none;' === mycomment){
          mycomment = '未评价';
        }

        // The shop id is taken from the onclick text: everything after the first
        // comma, with ")" and ";" stripped.
        var tempshopid = $("img[id=remarkFlag_" + orderid + "]").attr('onclick') || '';
        var shopid = tempshopid.substr(tempshopid.indexOf(",") + 1).replace(/[);]/g,"");

        var orderdesc = shopid + splitstr + orderid + splitstr + mycomment + splitstr + payMoney[index] + splitstr + orderTime[index] + splitstr + paytype[index] + splitstr + yunfei[index] + splitstr + orderStatus[index] + splitstr + users[index] + splitstr + remark[index] + splitstr + express[index];
        console.log("---------orderdesc-------" + orderdesc);
        orderInfo.push(orderdesc);
      });

      page = parseInt($("a[class=current]").text(), 10);
      totalPage = parseInt($("input[name=totalPage]").val(), 10);
      console.log(page + "--page-----------totalPage---" + totalPage);
      // Keep paging until the last page, with a hard cap at page 100.
      if(page < totalPage && page < 100){
        console.log("---------next-------");
        sendMsg( orderInfo, "next" );
        // The second "a.next" element is the one clicked to advance.
        var nextLink = $('a.next')[1];
        if(nextLink){
          nextLink.click();
        }else{
          console.log("---------next link not found-------");
        }
      }else{
        console.log("---------end-------");
        sendMsg( orderInfo, "end" );
      }
    }
    
    // Hand the scraped rows and the paging command over to the background page.
    function sendMsg( msg, cmd){
      var payload = {"msg": msg, "cmd": cmd};
      // The reply, if any, is deliberately ignored.
      var ignoreReply = function(response) {};
      chrome.extension.sendMessage(payload, ignoreReply);
    }

    5、创建后台处理js文件eventPage.js

    // True while a scraping run is active; checked by the page-load listener.
    var flag = false;
    // Id of the tab in which the current run was started.
    var currentTabId;

    // Clicking the toolbar icon starts a run on the tab that was clicked.
    chrome.browserAction.onClicked.addListener(function(tab) {
      console.log('Turning ' + tab.url);
      flag = true;
      currentTabId = tab.id;
      // onClicked already delivers the active tab, so the extra lookup through
      // the deprecated chrome.tabs.getSelected() is not needed.
      sendMsg(tab.id);
    });
    
    
    // While a run is active, re-trigger the content script after each page load.
    chrome.webNavigation.onCompleted.addListener(function( details ){
      console.log('加载完成***' + details.tabId );
      // onCompleted also fires for sub-frames and for other tabs. Only the
      // top-level document (frameId 0) of the tab being scraped may restart
      // scraping; otherwise one page could be scraped several times.
      if( flag && details.frameId === 0 && details.tabId === currentTabId ){
        sendMsg( details.tabId );
      }
    });
    
    // Receive scraped rows from the content script and upload them to the server.
    chrome.extension.onMessage.addListener(function(request, sender, sendResponse) {
      console.log("*******evenPage.js***chrome.extension.onMessage.addListener");

      // "end" marks the last page: stop reacting to page loads so the run does
      // not restart on its own. This is done first so that a malformed payload
      // cannot leave the flag set.
      if('end' === request.cmd){
        flag = false;
      }

      // Rows are joined with "#$#" for the server; anything but an array is
      // treated as "no rows".
      var rows = Array.isArray(request.msg) ? request.msg : [];

      $.ajax({
        // NOTE(review): the file name here ("getOrderinfo.php") differs in case
        // from the server file the article names ("getOrderInfo.php") - confirm
        // on case-sensitive servers.
        url: "服务器接受数据URL/getOrderinfo.php",
        cache: false,
        type: "POST",
        data: {'orderinfo': rows.join("#$#")},
        dataType: "json"
      }).done(function(msg) {
        console.log('*******************json*************' + msg.sql );
        // Acknowledge the upload to the tab that is being scraped.
        chrome.tabs.sendMessage(currentTabId, {"cmd":"end"}, function(response) {
          console.log(response);
        });
      }).fail(function(jqXHR, textStatus) {
        // Report the failure instead of dropping it silently.
        console.log('order upload failed: ' + textStatus);
      });
    });
    
    // Send the given SKU data, tagged with the "ok" command, to the active tab
    // of the current window and log whatever the tab replies.
    function sendSku2Info(colores){
      var activeTabQuery = {active: true, currentWindow: true};
      var skuMessage = {"cmd":"ok", "sku": colores};
      var logReply = function(response) {
        console.log(response);
      };

      chrome.tabs.query(activeTabQuery, function(tabs) {
        chrome.tabs.sendMessage(tabs[0].id, skuMessage, logReply);
      });
    }
    
    // Send the "start working" greeting to the content script in the given tab.
    function sendMsg( tabid ){
      var startMessage = {greeting: "start working"};
      // The reply, if any, is deliberately ignored.
      var ignoreReply = function(response) {};

      console.log(tabid + "--sendMsg()----eventPage.js");
      chrome.tabs.sendMessage(tabid, startMessage, ignoreReply);
    }

    6、创建服务器接收文件getOrderInfo.php(放在服务器哦,亲!)

    <?php
    /**
     * Receives the order rows posted by the Chrome extension and upserts them
     * into the `test` table.
     *
     * Expected POST field `orderinfo`: rows joined by "#$#", each row holding
     * eleven fields joined by "@_@" in the column order of the INSERT below.
     * The debug output echoed along the way is kept as in the original script.
     */

    header("Content-type:text/html; charset=utf-8");
    echo "***********************";
    // mysqli replaces the mysql_* functions, which were removed in PHP 7.
    $con = mysqli_connect("localhost", "root", "root", "test");
    echo "==============";
    if (!$con)
    {
      die('Could not connect: ' . mysqli_connect_error());
    }

    $orderinfo = isset($_POST['orderinfo']) ? $_POST['orderinfo'] : '';
    $orderArr = explode('#$#', $orderinfo);
    print_r($orderArr);

    // The posted values are untrusted, so they are bound as statement parameters
    // instead of being concatenated into the SQL text. On a duplicate order id
    // only remark, review state, order status and express number are refreshed.
    $sql = "INSERT INTO test(venderId, orderid, pingjia, money, ordertime, paytype, yunfei, orderstatu, user, remark, express) "
         . "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) "
         . "ON DUPLICATE KEY UPDATE remark = VALUES(remark), pingjia = VALUES(pingjia), orderstatu = VALUES(orderstatu), express = VALUES(express)";
    $stmt = mysqli_prepare($con, $sql);
    if (!$stmt)
    {
      die('Could not prepare statement: ' . mysqli_error($con));
    }
    // Parameters are bound by reference once; the loop below only reassigns them.
    mysqli_stmt_bind_param($stmt, "sssssssssss", $venderId, $orderid, $pingjia, $money, $ordertime, $paytype, $yunfei, $orderstatu, $user, $remark, $express);

    foreach($orderArr as $myorder){
      $value = explode('@_@', $myorder);
      // Skip rows that do not carry all eleven fields (e.g. an empty post).
      if (count($value) < 11)
      {
        continue;
      }
      echo "===========" . $value[10] ."</br>";
      list($venderId, $orderid, $pingjia, $money, $ordertime, $paytype, $yunfei, $orderstatu, $user, $remark, $express) = $value;
      if (!mysqli_stmt_execute($stmt))
      {
        echo "insert failed: " . mysqli_stmt_error($stmt) . "</br>";
      }
    }
    mysqli_stmt_close($stmt);
    mysqli_close($con);

    ?>

    7、创建数据库表

    -- Order table filled by getOrderInfo.php. The unique key on `orderid` is what
    -- lets that script's INSERT ... ON DUPLICATE KEY UPDATE refresh an existing
    -- order instead of inserting it a second time.
    CREATE TABLE `test` (
    `id` int(10) unsigned NOT NULL AUTO_INCREMENT COMMENT 'id',
    `venderId` int(10) NOT NULL DEFAULT '0' COMMENT '商家ID',
    `orderid` bigint(20) NOT NULL DEFAULT '0' COMMENT '订单ID',
    `pingjia` varchar(100) NOT NULL DEFAULT '' COMMENT '订单发出后的状态(是否评价)',
    `money` decimal(10,2) NOT NULL DEFAULT '0.00' COMMENT '订单金额',
    `ordertime` varchar(100) NOT NULL DEFAULT '' COMMENT '下单时间',
    `paytype` varchar(100) NOT NULL DEFAULT '' COMMENT '付款方式',
    `yunfei` decimal(10,2) NOT NULL DEFAULT '0.00' COMMENT '运费',
    `orderstatu` varchar(100) NOT NULL DEFAULT '' COMMENT '订单状态',
    `user` varchar(255) NOT NULL DEFAULT '' COMMENT '订单用户',
    `remark` varchar(255) NOT NULL DEFAULT '' COMMENT '备注',
    `express` varchar(255) NOT NULL DEFAULT '' COMMENT '物流方式和运单号',
    -- The next three columns are not in the receiving script's INSERT column
    -- list, so rows written by it keep the defaults.
    `shop_id` int(10) unsigned NOT NULL DEFAULT '0' COMMENT '店铺表ID',
    `shop_name` varchar(255) NOT NULL DEFAULT '' COMMENT '店铺名称',
    `stattime` int(11) NOT NULL DEFAULT '0' COMMENT '下单年月日',
    PRIMARY KEY (`id`),
    UNIQUE KEY `orderid` (`orderid`)
    ) ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT=''

    注意:各文件中的占位内容(原文以红色标出,例如"服务器接受数据url"、"抓取页面url"等)需要手工修改;此外如果修改了js文件名,请在manifest.json中同步修改相应的配置信息。

    插件加载步骤:

      

    示例代码:https://github.com/jackgitgz/chrome_server_plugin

    此示例仅供参考,有理解不到位或错误的还请指出。

  • 相关阅读:
    AngularJS(3)-过滤器
    AngularJS(2)-Scope作用域和控制器
    iOS局部刷新
    python(一)入门
    Java基础
    AngularJS(1)随笔
    mac下如何查看指定端口被谁占用并且杀死该进程
    Python 字节码bytecode
    Python 作用域和命名空间
    Python函数的默认参数的设计【原创】
  • 原文地址:https://www.cnblogs.com/rwxwsblog/p/4490530.html
Copyright © 2020-2023  润新知