• web.js


    var page = require('webpage').create(),
        system = require('system'),
        address, output, csvPath, nodePathFile, outOriginalimg, PCSPuserAgent;
    var fs = require("fs");

    // PhantomJS entry point.
    //
    // Expected command line (system.args[0] is the script itself):
    //   [1] URL to open
    //   [2] image path rendered AFTER markCapture() has run
    //   [3] path of the DOM dump (CSV-like, "{_}"-separated)
    //   [4] node-path file; the script waits for it to exist, then markCapture() reads it
    //   [5] image path rendered BEFORE markCapture() has run
    //   [6] user agent string; only applied when it contains "iPhone"
    //
    // All six arguments are dereferenced below, so fewer than 7 entries is a
    // usage error (the previous `length < 0` test could never be true).
    if (system.args.length < 7) {
        console.log('Usage: ' + system.args[0] +
            ' URL output csvPath nodePathFile originalImage userAgent');
        phantom.exit(1);
    } else {
        address = system.args[1];
        output = system.args[2];
        csvPath = system.args[3];
        nodePathFile = system.args[4];
        outOriginalimg = system.args[5];
        PCSPuserAgent = system.args[6];
        console.log(PCSPuserAgent);

        // Dumps are written to a temporary "_1" file and then moved, so the
        // final path only appears once a dump is complete.
        var BeforecsvPath = csvPath.replace('.csv', '_1.csv');
        // Number of additional "__N.csv" snapshots taken so far.
        var csvindex = 0;

        var pageSettings = {
            javascriptEnabled: true,
            loadImages: true
        };
        if (PCSPuserAgent.indexOf("iPhone") > -1) {
            pageSettings.userAgent = PCSPuserAgent;
        }
        page.settings = pageSettings;

        // NOTE(review): the key of the first entry was missing in the source
        // (`{414,height:30}` does not parse); 414 is assumed to be the width.
        page.viewportSize = { width: 414, height: 30 };

        page.open(address, function (status) {
            if (status !== 'success') {
                console.log('Unable to load the address: ' + address);
                phantom.exit(1);
                return;
            }

            if (fs.exists(csvPath)) {
                fs.remove(csvPath);
            }

            // First DOM dump, 40 s after the load callback.
            window.setTimeout(function () {
                VisiteHtmlDom("0", BeforecsvPath);
                console.log("---------------------FS MOVE------------------------------");
                fs.move(BeforecsvPath, csvPath);
            }, 40000);

            // Once the first dump exists, each poll writes one more snapshot
            // ("__1.csv" .. "__5.csv"); the poll after the fifth reports done.
            waitFor2(
                function csvCreate() {
                    if (!fs.exists(csvPath)) {
                        return false;
                    }
                    if (csvindex >= 5) {
                        return true;
                    }
                    csvindex++;
                    console.log("csvindex Index:" + csvindex);
                    var csvPathNow = csvPath.replace('.csv', '__' + csvindex.toString() + '.csv');
                    var BeforecsvPathNow = csvPathNow.replace('.csv', '_1.csv');
                    if (fs.exists(csvPathNow)) {
                        fs.remove(csvPathNow);
                    }
                    console.log("csvindex:" + csvindex + "," + "csvPathNow:" + csvPathNow + "," + "BeforecsvPathNow:" + BeforecsvPathNow);
                    VisiteHtmlDom("0", BeforecsvPathNow);
                    fs.move(BeforecsvPathNow, csvPathNow);
                    return false;
                },
                function csvCreate2() {
                    console.log("-----------------csvPath2 END---------------");
                }
            );

            // Wait for the node-path file, then capture both screenshots.
            waitFor(
                function nodePathFileCheck() {
                    if (!fs.exists(nodePathFile)) {
                        console.log('nodepathFile Not Find');
                        return false;
                    }
                    console.log("Find nodePathFile");
                    return true;
                },
                function heheda() {
                    window.setTimeout(function () {
                        console.log("---------------------Capture Original Begin------------------------------");
                        var scrollheight = page.evaluate(function () {
                            return document.body.scrollHeight;
                        });
                        // Grow the viewport to the full document height before rendering.
                        page.viewportSize = { width: 414, height: scrollheight };

                        page.render(outOriginalimg);
                        console.log("---------------------Capture Begin------------------------------");
                        markCapture();
                        console.log("scrollheight:" + scrollheight);
                        // Render the marked page 10 s after the borders were applied.
                        window.setTimeout(function () {
                            page.render(output);
                            page.close();
                            console.log('render ok');
                            phantom.exit();
                        }, 10000);
                    }, 25000);
                }
            );
        });
    }
    /**
     * Appends one row describing the DOM node at `nodePath` to `inputcsvPath`,
     * then recurses into each of the node's children (depth-first, pre-order).
     *
     * A path is "0" for the <html> element followed by "/<childNodes index>"
     * per level. When the walk reaches a visible IFRAME, the next path segment
     * is consumed to step into the frame's own <html> element.
     *
     * Every row has 11 "{_}"-separated columns:
     *   child count, node name, path, width, height, left, top, display,
     *   image/frame URL, background-image, text
     *
     * Nodes that can no longer be resolved (the DOM changed between calls, or
     * a frame document is not accessible) are logged and skipped.
     *
     * @param {string} nodePath     path of the node to describe
     * @param {string} inputcsvPath file the row is appended to
     */
    function VisiteHtmlDom(nodePath, inputcsvPath) {
        // The callback is serialized and run inside the page, so it must not
        // reference anything from this script's scope.
        var nodeinfo = page.evaluate(function (str) {
            var SEP = "{_}";

            function row(fields) {
                return fields.map(String).join(SEP);
            }

            // Nodes reported with zero children so the walk never descends
            // into them. Exact names are used: substring matching also caught
            // HEADER and THEAD and dropped their subtrees.
            function isOpaque(name) {
                return name === "SCRIPT" || name === "NOSCRIPT" || name === "HEAD";
            }

            var node = document.getElementsByTagName('html')[0];
            var segments = str.split("/");

            for (var i = 1; i < segments.length; i++) {
                if (isOpaque(node.nodeName)) {
                    continue;
                }
                if (node.nodeName === "IFRAME" && window.getComputedStyle(node).display != "none") {
                    try {
                        node = node.contentWindow.document.getElementsByTagName('html')[0];
                    } catch (e) {
                        return null; // frame document not accessible
                    }
                } else {
                    node = node.childNodes[parseInt(segments[i], 10)];
                }
                if (!node) {
                    return null; // path no longer matches the DOM
                }
            }

            var name = node.nodeName;
            var count = node.childNodes.length;

            if (name === "#text") {
                var text = (node.nodeValue || "").replace(/\r|\n/g, "");
                return row([count, name, str, "", "", "", "", "", "", "", text]);
            }
            if (isOpaque(name) || name === "#comment") {
                return row(["0", name, str, "", "", "", "", "", "", "", ""]);
            }

            var bgImgUrl = node.style.getPropertyValue("background-image");
            if (bgImgUrl != null) {
                // Strip the leading "url(" and the closing ")".
                // NOTE(review): the source read /^(url)(|)/, which looks like
                // this pattern with its backslashes lost; confirm.
                bgImgUrl = bgImgUrl.replace(/^(url)\(|\)/g, '');
            }

            var computed = window.getComputedStyle(node);
            var box = [computed.width, computed.height, computed.left, computed.top, computed.display];

            switch (name) {
                case "IMG":
                    return row([count, name, str].concat(box, [node.src, "", ""]));
                case "EMBED":
                    return row([count, name, str, "", "", "", "", "", "", "", ""]);
                case "IFRAME":
                    // A visible frame reports one child: its document's <html>.
                    return row([computed.display != "none" ? "1" : "0", name, str].concat(box, [node.src, "", ""]));
                default:
                    return row([count, name, str].concat(box, ["", bgImgUrl, ""]));
            }
        }, nodePath);

        if (typeof nodeinfo !== "string") {
            console.log("skip unresolved node: " + nodePath);
            return;
        }

        console.log("create CSV");
        fs.write(inputcsvPath, nodeinfo + "\n", 'a');

        var childNodesCount = parseInt(nodeinfo.split("{_}")[0], 10);
        for (var childIndex = 0; childIndex < childNodesCount; childIndex++) {
            VisiteHtmlDom(nodePath + "/" + childIndex.toString(), inputcsvPath);
        }
    }
    /**
     * Reads `nodePathFile` line by line and draws a 5px red border around the
     * node each line addresses.
     *
     * A line is a "/"-separated path: the first segment stands for <html>,
     * each following segment is a childNodes index. Whenever the walk lands on
     * an IFRAME it steps into the frame's <html> element and skips the next
     * segment. Text nodes have no style, so their parent is bordered instead.
     *
     * Blank lines and paths that cannot be resolved are ignored. The stream
     * is closed even if evaluating a line throws.
     */
    function markCapture() {
        var stream = fs.open(nodePathFile, 'r');
        try {
            while (!stream.atEnd()) {
                var line = stream.readLine().trim();
                if (line === "") {
                    // An empty path would otherwise resolve to <html> itself.
                    continue;
                }
                // The callback runs inside the page and must be self-contained.
                page.evaluate(function (line) {
                    var node = document.getElementsByTagName('html')[0];
                    var segments = line.split("/");

                    for (var i = 1; i < segments.length; i++) {
                        node = node.childNodes[parseInt(segments[i], 10)];
                        if (!node) {
                            return false; // path does not match the DOM
                        }
                        if (node.nodeName.indexOf("IFRAME") > -1) {
                            try {
                                node = node.contentWindow.document.getElementsByTagName('html')[0];
                            } catch (e) {
                                return false; // frame document not accessible
                            }
                            if (!node) {
                                return false;
                            }
                            i++; // the segment after a frame only selects its <html>
                        }
                    }

                    var target = node.nodeName.indexOf("text") > -1 ? node.parentNode : node;
                    if (!target || !target.style) {
                        return false;
                    }
                    // border-box keeps the element's outer size unchanged by the border.
                    target.style.boxSizing = "border-box";
                    target.style.border = "5px solid #ff0000";
                    return true;
                }, line);
            }
        } finally {
            stream.close();
        }
    }
     /**
      * Polls `testFx` every 5 seconds until it returns a truthy value, then
      * calls `onReady` on the following poll. If the timeout elapses first,
      * logs a message and exits PhantomJS with code 1.
      *
      * The interval is cleared BEFORE `onReady` runs, so an exception thrown
      * by `onReady` cannot cause it to be invoked again on the next poll.
      *
      * @param {Function|string} testFx  condition to poll
      * @param {Function|string} onReady action to run once the condition holds
      * @param {number} [timeOutMillis]  maximum wait in ms; a missing or falsy
      *                                  value means 120000 (2 minutes)
      */
     function waitFor(testFx, onReady, timeOutMillis) {
         var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 120000,
             start = new Date().getTime(),
             condition = false,
             interval = setInterval(function () {
                 if ((new Date().getTime() - start < maxtimeOutMillis) && !condition) {
                     // Not timed out and not yet fulfilled: poll again.
                     // NOTE: string arguments are eval'ed; never pass untrusted text.
                     condition = (typeof testFx === "string" ? eval(testFx) : testFx());
                     return;
                 }

                 clearInterval(interval);

                 if (!condition) {
                     console.log("'waitFor()' timeout");
                     phantom.exit(1);
                     return;
                 }

                 console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
                 if (typeof onReady === "string") {
                     eval(onReady);
                 } else {
                     onReady();
                 }
             }, 5000);
     }
     /**
      * Second poller with the same contract as waitFor(): calls `testFx`
      * every 5 seconds until it returns a truthy value, runs `onReady` on the
      * following poll, and exits PhantomJS with code 1 if the timeout elapses
      * first. It keeps its own timer so it can run alongside waitFor().
      *
      * The interval is cleared BEFORE `onReady` runs, so an exception thrown
      * by `onReady` cannot cause it to be invoked again on the next poll.
      *
      * @param {Function|string} testFx  condition to poll
      * @param {Function|string} onReady action to run once the condition holds
      * @param {number} [timeOutMillis]  maximum wait in ms; a missing or falsy
      *                                  value means 120000 (2 minutes)
      */
     function waitFor2(testFx, onReady, timeOutMillis) {
         var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 120000,
             start = new Date().getTime(),
             condition = false,
             interval = setInterval(function () {
                 var elapsed = new Date().getTime() - start;
                 if (elapsed < maxtimeOutMillis && !condition) {
                     // Not timed out and not yet fulfilled: poll again.
                     // NOTE: string arguments are eval'ed; never pass untrusted text.
                     condition = (typeof testFx === "string" ? eval(testFx) : testFx());
                     return;
                 }

                 clearInterval(interval);

                 if (!condition) {
                     // The message text is shared with waitFor() on purpose.
                     console.log("'waitFor()' timeout");
                     phantom.exit(1);
                     return;
                 }

                 console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
                 if (typeof onReady === "string") {
                     eval(onReady);
                 } else {
                     onReady();
                 }
             }, 5000);
     }
    

      

  • 相关阅读:
    实现SQL SERVER 下的PadLeft函数
    C#中String和string区别
    SQL经验分享(二)取得数据库中所有的表名、字段名以及字段属于哪个表
    JavaScript获取上传文件后缀名
    Url重写
    C#编程风格约定
    解决SharePoint 2003的爬网性能问题 之七
    [跨DB查询]查找SharePoint 2007中的Orphan Feature的SQL语句
    如何得到MOSS 2007的最近的100次爬网的信息
    解决SharePoint 2003的爬网性能问题 之八
  • 原文地址:https://www.cnblogs.com/c-x-a/p/7267747.html
Copyright © 2020-2023  润新知