• VBA分别使用MSXML的DOM属性和XPATH进行网页爬虫


    本文要重点介绍的是VBA中的XmlHttp对象(MSXML2.XMLHTTP或MSXML.XMLHTTP),它可以向http服务器发送请求并使用微软XML文档对象模型Microsoft XML Document Object Model (DOM)处理回应。练习抓取的网页例子是https://www.qppstudio.net/public-holidays-by-date/month1.htm

    第一种方法——DOM经典属性:

    参考http://club.excelhome.net/thread-1233167-1-1.htmlhttps://www.jianshu.com/p/1920550cb4a6

    Sub Main()
    ActiveSheet.Cells.Clear
    Url = "https://www.qppstudio.net/public-holidays-by-date/month1.htm"
    Set oHttp = CreateObject("MSXML2.XMLHTTP") '创建一个xmlhttp对象
    Set odom = CreateObject("htmlfile") '创建一个Dom对象
    With oHttp
    'open,创建一个新的http请求,并指定此请求的方法、URL以及验证信息(用户名/密码) 'send,发送请求到http服务器并接收回应 .Open "GET", Url, False '使用Open方法,用get请求,False代表非异步加载
        .Open "GET", Url, False '使用Open方法,用get请求,False代表非异步加载
        .send '将open方法的信息发送给网页服务器
         odom.body.innerHTML = .responseText '将响应网页的HTML赋值给Dom对象,并只需要body标签里面的内容
    End With
    dom (odom)
    End Sub
    Sub dom(odom As Object) i = 2 For Each Item In odom.all If Item.className = "list-item" Then For Each itemch In Item.Children If itemch.className = "list-item-heading" Then Range("a" & i) = itemch.innerText ElseIf itemch.className = "list-subitem" Then Range("b" & i) = itemch.Children(1).innerText Range("c" & i) = itemch.Children(3).innerText i = i + 1 End If Next Exit For End If Next End Sub

    第二种方法——转换为XML并使用XPATH(比较麻烦):

    参考http://club.excelhome.net/thread-1233167-1-1.html

    Sub Main()
    Url = "https://www.qppstudio.net/public-holidays-by-date/month1.htm"
    Set oHttp = CreateObject("MSXML2.XMLHTTP") '创建一个xmlhttp对象
    Set odom = CreateObject("htmlfile") '创建一个Dom对象
    With oHttp
    'open,创建一个新的http请求,并指定此请求的方法、URL以及验证信息(用户名/密码) 'send,发送请求到http服务器并接收回应 .Open "GET", Url, False '使用Open方法,用get请求,False代表非异步加载
        .Open "GET", Url, False '使用Open方法,用get请求,False代表非异步加载
        .send '将open方法的信息发送给网页服务器
         odom.body.innerHTML = .responseText '将响应网页的HTML赋值给Dom对象,并只需要body标签里面的内容
    End With
    
    '需要先将html文本进行格式化才能写入xmldoc,才能使用自带的xpath,比如节点一定要有开始和结束,节点属性一定要用双引号括起来
    '例如
    'sXML = "<NewDataSet class=""123""><MyTable>"
    'sXML = sXML & "     <Active>true</Active>"
    'sXML = sXML & "     <SQLServer>APCD03</SQLServer>"
    'sXML = sXML & "     <SQLDatabase>OIS</SQLDatabase>"
    'sXML = sXML & " </MyTable>"
    'sXML = sXML & " <MyTable>"
    'sXML = sXML & "     <Active>false</Active>"
    'sXML = sXML & "     <SQLServer>APCD04</SQLServer>"
    'sXML = sXML & "     <SQLDatabase>OIS</SQLDatabase>"
    'sXML = sXML & " </MyTable></NewDataSet>"
    'Debug.Print sXML
    Dim sXML As String, xDoc, a, nodelist, node
    For Each Item In odom.all
      If Item.className = "list-item" Then
        sXML = Item.outerHTML
        Exit For
      End If
    Next
    sXML = rr(sXML, "<IMG.*?>", "")
    sXML = rr(sXML, "class=.*?>", ">")
    Set xDoc = CreateObject("MSXML.DOMDocument")
    a = xDoc.LoadXML(sXML)
    'a为true时代表写入成功,为false代表写入失败
    'Debug.Print a
    '一旦a为false就可以先写入txt再看哪些还不符合xml规范
    'file = ThisWorkbook.Path & "	est.txt"
    'Open file For Output As #1
    'Print #1, sXML
    'Close #1
    Set nodelist = xDoc.SelectNodes("//P")
    Set node = xDoc.SelectSingleNode("//P")
    'Debug.Print nodelist.Length
    For Each Item In nodelist
    Debug.Print Item.Text
    Next
    End Sub
    
    Function rr(str As String, pattern As String, repstr As String)
    Set reg = CreateObject("vbscript.regexp")
    With reg
    .Global = True
    .pattern = pattern
    End With
    rr = reg.Replace(str, repstr)
    End Function
  • 相关阅读:
    Leveldb Advanced
    loadrunner生成随机数
    用strtok函数分割字符串
    loadrunner关联及web_reg_save_param方法浅析
    Linux中find用法
    在LoadRunner中从数组类型的参数随机取值的方法
    LoadRunner可以把关联取值当作检查点来使用
    Oracle特殊字符转义:&amp;和&#39;
    LoadRunner 11 error:Cannot initialize driver dll
    用SecureCRT在windows和CentOS间上传下载文件
  • 原文地址:https://www.cnblogs.com/JTCLASSROOM/p/11132518.html
Copyright © 2020-2023  润新知