• 获取标题


    Sub GetCatalogPages()
          For n = 1 To 20
                CatalogURL = "http://blog.sina.com.cn/s/_" & n & ".html"
               Call GetCatalogByUrl(CatalogURL)
          Next n
    End Sub
    Sub GetCatalogByUrl(ByVal CatalogURL As String)
        'Dim CatalogURL As String
        Dim WebText As String
        Dim OneSpan As Object
        Dim OneA As Object
        Dim Wb As Workbook
        Dim Sht As Worksheet
        Dim Rng As Range
        Dim i As Long, j As Long
        
        Dim StartTime As Variant    '开始时间
        Dim UsedTime As Variant    '使用时间
        StartTime = VBA.Timer    '记录开始时间
        
        AppSettings
    
        
        Set Wb = Application.ThisWorkbook
        Set Sht = Wb.Worksheets("Catalog")
        With Sht
            '.UsedRange.Offset(1).ClearContents
            'i = 1
          endrow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
          i = endrow
            '发送请求
            With CreateObject("MSXML2.XMLHTTP")
                .Open "GET", CatalogURL, False
                .Send
                WebText = .responsetext
            End With
            '创建网页文件 创建 Html Dom
            'Microsoft HTML Object Library
            With CreateObject("htmlfile")
                .write WebText
                For Each OneA In .getElementsByTagName("a")
                    href = OneA.href
                    If href Like "*http://blog.sina.com.cn/s/blog_*" Then
                        i = i + 1
                        Sht.Cells(i, 2).Value = href
                       '     Sht.Hyperlinks.Add Sht.Cells(i, 2), href ', href
                    End If
                Next OneA
                 i = endrow
                For Each OneMeta In .getElementsByTagName("meta")
                    If OneMeta.Name = "description" Then
                        cnt = OneMeta.Content
                        'Debug.Print cnt
                        titles = Split(Split(cnt, "xxxx,")(1), ",")
                        For n = LBound(titles) To UBound(titles) Step 1
                            i = i + 1
                            Sht.Cells(i, 1).Value = titles(n)
                        Next n
                    End If
                Next OneMeta
            End With
        End With
        AppSettings False
        UsedTime = VBA.Timer - StartTime
        Debug.Print "采集     " & CatalogURL; " :  " & Format(UsedTime, "#0.0000秒")
        'MsgBox "本次运行耗时:" & Format(UsedTime, "#0.0000秒")
    End Sub
    Sub GetQuestionsByExamUrl()
        
        Dim Wb As Workbook
        Dim Sht As Worksheet
        Dim oSht As Worksheet
    Set Wb = Application.ThisWorkbook
        
        Set Sht = Wb.Worksheets("Catalog")
        Set oSht = Wb.Worksheets("Question")
        
        With Sht
            endrow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
            Set Rng = .Range("A2:B" & endrow)
            Arr = Rng.Value
        End With
        With oSht
            r = 1
            For i = LBound(Arr) To UBound(Arr)
                ExamTitle = Arr(i, 1)
                ExamUrl = Arr(i, 2)
                ExamText = GetExamTextByUrl(ExamUrl)
                Ques = RegGetArray(ExamText, "([((]d[))][^
    ]*)[
    ]")
                For n = LBound(Ques) To UBound(Ques) Step 1
                    r = r + 1
                    .Cells(r, 1).Value = ExamTitle
                    .Cells(r, 2).Value = ExamUrl
                    .Cells(r, 3).Value = Ques(n)
                Next n
                
            Next i
        End With
        
        
        
        Set Wb = Nothing
        Set Sht = Nothing
        Set oSht = Nothing
        
    End Sub
    
    Function GetExamTextByUrl(ByVal ExamUrl As String) As String
           '发送请求
            With CreateObject("MSXML2.XMLHTTP")
                .Open "GET", ExamUrl, False
                .Send
                WebText = .responsetext
                'Debug.Print WebText
            End With
            With CreateObject("htmlfile")
                .write WebText
               Set examdiv = .getElementById("sina_keyword_ad_area2")
               ' Debug.Print examdiv.innerText
              GetExamTextByUrl = examdiv.innerText
            End With
    End Function
    Private Sub AppSettings(Optional IsStart As Boolean = True)
        If IsStart Then
            Application.ScreenUpdating = False
            Application.DisplayAlerts = False
            Application.Calculation = xlCalculationManual
            Application.StatusBar = ">>>>>>>>Macro Is Running>>>>>>>>"
        Else
            Application.ScreenUpdating = True
            Application.DisplayAlerts = True
            Application.Calculation = xlCalculationAutomatic
            Application.StatusBar = False
        End If
    End Sub
    
    Public Function RegGetArray(ByVal OrgText As String, ByVal Pattern As String) As String()
        Dim Reg As Object, Mh As Object, OneMh As Object
        Dim Arr() As String, Index As Long
        Dim Elm As String
        Set Reg = CreateObject("Vbscript.Regexp")
        With Reg
            .MultiLine = True
            .Global = True
            .Ignorecase = False
            .Pattern = Pattern
            Set Mh = .Execute(OrgText)
            
            Index = 0
            ReDim Arr(1 To 1)
            For Each OneMh In Mh
                Index = Index + 1
                ReDim Preserve Arr(1 To Index)
                'If OneMh.submatches(0) <> "" Then Elm = OneMh.submatches(0)
                Arr(Index) = OneMh.submatches(0)
                'Debug.Print OneMh.submatches(0)
            Next OneMh
        End With
        RegGetArray = Arr
        Set Reg = Nothing
        Set Mh = Nothing
    End Function
    
    
    Public Function RegGet(ByVal OrgText As String, ByVal Pattern As String) As String
    '传递参数 :原字符串, 匹配模式
        Dim Regex As Object
        Dim Mh As Object
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        If Regex.test(OrgText) Then
            Set Mh = Regex.Execute(OrgText)
            RegGet = Mh.Item(0).submatches(0)
        Else
            RegGet = ""
        End If
        Set Regex = Nothing
    End Function
    

      

  • 相关阅读:
    《程序员修炼之道》——第二章 注重实效的途径(三)
    《程序员修炼之道》——第二章 注重实效的途径(二)
    《程序员修炼之道》——第二章 注重实效的途径(一)
    win10 磁盘占用高--- 禁用用户改善反馈 CompatTelRunner.exe
    ffmpeg拼接多个音频
    词云-wordcloud
    大数据指数日常应用
    搜索过滤Tip : title,site(搜标题和搜网站)
    eclipse下查看java源码设置
    sqlplus sys as sysdba
  • 原文地址:https://www.cnblogs.com/nextseven/p/8437061.html
Copyright © 2020-2023  润新知