• nodejs 访问网站并操作xpath


    var xpath = require('xpath'); //引用xpath包
    var dom = require('xmldom-silent').DOMParser;//引用xmldom包
    var request=require('request');
    var fs=require('fs');
    var urlencode = require('urlencode');//引用url解码和编码包
    // Request headers: send a mobile (iPhone Safari) User-Agent string.
    // NOTE(review): presumably the XPath selectors in findXpath expect the markup
    // served to this user agent - confirm before changing it.
    var headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A403 Safari/8536.25'
    };

    // Fetch the search result page and hand its HTML to findXpath.
    request(
        {
            url: "https://www.google.co.jp/search?hl=ja&newwindow=1&site=&source=hp&q=hotel&oq=",
            headers: headers
        },
        function (error, response, body) {
            // On a transport failure `body` is undefined; report the error instead
            // of parsing nothing and printing a misleading "ok".
            if (error) {
                console.error("request failed:", error);
                return;
            }
            if (typeof body !== 'string' || body.length === 0) {
                console.error("empty response body, status:", response && response.statusCode);
                return;
            }
            findXpath(body);
            // To dump the raw HTML for debugging, call fileWrite(body) here.
            console.log("ok");
        }
    );
    
    /**
     * Parse a fetched search result page and print the ads found in it.
     *
     * Four XPath queries collect, in document order, the display URL (cite),
     * the headline (h3 text), the click-through href and the description text.
     * The results are paired up by index and printed with console.log as four
     * lines per ad, after a single separator line.
     *
     * The number of printed ads is driven by the cite query. When another
     * query yields fewer nodes, the missing field is printed as an empty
     * string instead of throwing.
     *
     * @param {string} xml - Markup of the page. A non-string or empty value is
     *     ignored and nothing is printed.
     * @returns {undefined}
     */
    function findXpath(xml){
        if (typeof xml !== 'string' || xml.length === 0) {
            return;
        }

        var doc = new dom().parseFromString(xml);
        // NOTE(review): the parser may yield no document for unparseable input
        // (not verifiable from this file); nothing can be queried in that case.
        if (!doc) {
            return;
        }

        var XPATH_CITE = "//div[@id='mbEnd']//ol/li//cite/text()|//div[@id='tads']//ol/li//cite/text()|//div[@id='tadsb']//div[@class='ads-ad']//h3/text()";
        var XPATH_H3 = "//div[@class='ads-ad']//h3//text()";
        var XPATH_ADURL = "//div[@class='ads-ad']/h3/a/@href|//div[@id='tadsb']/ol/li/h3/a/@href";
        var XPATH_INFO = "//div[@id='mbEnd']//ol/li//div[@class='ac ads-creative']//text()|//div[@id='taw']//ol/li//div[contains(@class,'ads-creative')]//text()|//div[@class='ads-ad']//div[@class='ads-creative']//text()";

        var citeNodes = xpath.select(XPATH_CITE, doc);
        var h3Nodes = xpath.select(XPATH_H3, doc);
        var adInfoNodes = xpath.select(XPATH_INFO, doc);
        var adUrlNodes = xpath.select(XPATH_ADURL, doc);

        console.log("---------------------Node--------------Info-----------------------");

        for (var i = 0; i < citeNodes.length; i++) {
            console.log(nodeText(citeNodes[i]));
            console.log(nodeText(h3Nodes[i]));
            console.log(extractAdUrl(nodeText(adUrlNodes[i])));
            console.log(nodeText(adInfoNodes[i]));
        }
    }

    /**
     * Return the nodeValue of an XPath result node, or '' when the node is
     * missing or has no value.
     */
    function nodeText(node) {
        if (!node || node.nodeValue == null) {
            return '';
        }
        return String(node.nodeValue);
    }

    /**
     * Extract and URL-decode the landing page carried in the trailing `adurl=`
     * parameter of an ad href. An href without such a parameter is returned
     * unchanged.
     */
    function extractAdUrl(href) {
        // \S (any non-whitespace character) is required here; the character
        // class [S] would only match a literal capital "S".
        var found = /adurl=(http\S*)$/.exec(href);
        if (!found) {
            return href;
        }
        return urlencode.decode(found[1]);
    }
    /**
     * Save the given content to the file 233.html.
     *
     * The write is asynchronous: a failure is rethrown from the completion
     * callback, and success is reported on the console.
     *
     * @param {string} body - Content to write.
     */
    function fileWrite(body)
    {
        var onWritten = function (err) {
            if (err) {
                throw err;
            }
            console.log('Saved successfully'); // the file has been saved
        };

        fs.writeFile('233.html', body, onWritten);
    }
    

      

  • 相关阅读:
    洛谷P3811题解
    洛谷P3353在你窗外闪耀的星星-题解
    Map根据value来排序
    java8 groupby count
    Java反射
    maven profile环境切换
    获取nginx代理情况下的真实ip
    获取request里header的name和value
    git 删除iml文件
    java list 排序
  • 原文地址:https://www.cnblogs.com/c-x-a/p/5482187.html
Copyright © 2020-2023  润新知