• 试卷


    Private Declare Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" (ByVal pCaller As Long, ByVal szURL As String, ByVal szFileName As String, ByVal dwReserved As Long, ByVal lpfnCB As Long) As Long
    Private Declare Function DeleteUrlCacheEntry Lib "wininet" Alias "DeleteUrlCacheEntryA" (ByVal lpszUrlName As String) As Long
    Public Sub Crawler()
        Dim StartTime As Variant '开始时间
        Dim UsedTime As Variant '使用时间
        StartTime = VBA.Timer '记录开始时间
        
        With Sheets("试卷URL")
            i = 2
            Do While .Cells(i, 1).Value <> ""
                OneKeyCreateExam .Cells(i, 2).Text
                i = i + 1
                If i = 1000 Then Exit Do
            Loop
        End With
        
        UsedTime = VBA.Timer - StartTime
        MsgBox "本次运行耗时:" & Format(UsedTime, "#0.0000秒")
        
    End Sub
    '下载网络图片
    Sub DownloadImageName(ByVal ImageURL As String, ByVal ImagePath As String)
        Dim lngRetVal As Long
        lngRetVal = URLDownloadToFile(0, ImageURL, ImagePath, 0, 0)
        If lngRetVal = 0 Then
            DeleteUrlCacheEntry ImageURL  '清除缓存
            'MsgBox "成功"
        Else
            'MsgBox "失败"
        End If
    End Sub
    
    Sub OneKeyCreateExam(ByVal URL As String)
        Dim ImgNames As Variant
        Dim strText As String
        Dim i As Long, n As Long, m As Long
        Dim OneTagP As Object
        Dim OneTagA As Object
        Dim TagP As Object
        Dim PosText As String
        Dim Arr() As String
        ReDim Arr(1 To 1) As String
        Dim Brr() As String
        ReDim Brr(1 To 1)
        Dim ImageURL As String
        Dim FilePath As String
        Dim FileName As String
        
        Dim dContent As Object
        Set dContent = CreateObject("Scripting.Dictionary")
        Dim dImageName As Object
        Set dImageName = CreateObject("Scripting.Dictionary")
        
        Dim StartTime As Variant    '开始时间
        Dim UsedTime As Variant    '使用时间
        StartTime = VBA.Timer    '记录开始时间
        
        AppSettings
       On Error GoTo ErrHandler
        'Debug.Print URL
        '设置URL,访问网页获取网页源码
        'URL = ActiveSheet.Range("A2").Text    '"http://blog.sina.com.cn/s/blog_5a18c50f0102x8lg.html"
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", URL, False
            .Send
            strText = .responsetext
        End With
        
        '创建网页文件
        With CreateObject("htmlfile")
            .write strText
            
            '获取标题
            FileName = .getElementsByTagName("h2")(0).innerhtml
            Debug.Print "开始正在采集>>>>>>"; FileName; "   网络地址 :" & URL
            
            Application.StatusBar = ">>>>>>正在下载图片>>>>>>"
            
            i = 0    '初始化序号
            
            For Each OneTagA In .getElementsByTagName("a")
                '循环所有A标签
                If OneTagA.HasChildNodes Then
                    If OneTagA.href Like "http://photo.blog.sina.com.cn/showpic.html*" Then
                        
                        
                        If OneTagA.ParentNode.tagname = "DIV" Then
                            Set TagP = OneTagA.PreviousSibling
                            Do While TagP.tagname <> "P"
                                Set TagP = TagP.PreviousSibling
                            Loop
                            i = i + 1
                            '文字内容提取
                            PosText = TagP.innerhtml
                            PosText = RegReplace(PosText, "<.*?>")
                            PosText = Replace(PosText, " ", "")
                            
                            '获取图片URL
                            ImageURL = OneTagA.FirstChild.getAttribute("real_src")
                            ImageName = "Image" & i & ".jpg"
                            ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
                            DownloadImageName ImageURL, ImagePath    '下载图片
                            
                            '获取图片
                            If dImageName.Exists(PosText) = False Then
                                dImageName(PosText) = ImageName
                            Else
                                dImageName(PosText) = dImageName(PosText) & "|" & ImageName
                            End If
                        ElseIf OneTagA.ParentNode.tagname = "P" Then
                            'On Error Resume Next
                            Set TagP = OneTagA.ParentNode.PreviousSibling
                            
                            i = i + 1
                            '文字内容提取
                            PosText = TagP.innerhtml
                            PosText = RegReplace(PosText, "<.*?>")
                            PosText = Replace(PosText, " ", "")
                            
                            '获取图片URL
                            ImageURL = OneTagA.FirstChild.getAttribute("real_src")
                            ImageName = "Image" & i & ".jpg"
                            ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
                            DownloadImageName ImageURL, ImagePath    '下载图片
                            
                            '获取图片
                            If dImageName.Exists(PosText) = False Then
                                dImageName(PosText) = ImageName
                            Else
                                dImageName(PosText) = dImageName(PosText) & "|" & ImageName
                            End If
                            
                            
                        End If
                    End If
                End If
            Next OneTagA
            
            Application.StatusBar = ">>>>>>正在获取文本>>>>>>"
            
            Dim IsContent As Boolean
            i = 0    '初始化序号
            n = 0    '初始化序号
            For Each OneTagP In .getElementsByTagName("p")
                '文字内容提取
                
                PosText = OneTagP.innerhtml
                PosText = RegReplace(PosText, "<.*?>")
                PosText = Replace(PosText, " ", "")
                
                i = i + 1
                If (InStr(PosText, "地理") > 0 And i > 15) Or i > 20 Then IsContent = True
                If PosText = "喜欢" Then Exit For    '提前结束循环
                
                If IsContent Then
                    If Len(PosText) > 0 Then
                        n = n + 1
                        ReDim Preserve Arr(1 To n) '保留非空数组  '开始记录试卷内容
                        Arr(n) = PosText    '存入数组
                        '   Debug.Print n; "               "; PosText
                        'dContent(PosText) = n
                    End If
                End If
            Next
            
            
        End With
        
        
        
        Application.StatusBar = ">>>>>>正在创建Word文档>>>>>>"
        
        FilePath = ThisWorkbook.Path & "" & FileName & ".doc"
        On Error Resume Next
        Kill FilePath
        On Error GoTo 0
        
        Dim wdApp As Object
        Dim Doc As Object
        Set wdApp = CreateObject("Word.Application")
        Set Doc = wdApp.documents.Add()
        
        Doc.Activate
        
        For i = 1 To UBound(Arr)
            
            PosText = Arr(i)
            
            wdApp.Selection.TypeText Text:=PosText
            wdApp.Selection.TypeParagraph
            
            
            If dImageName.Exists(PosText) Then
                '如果含有图片
                If InStr(dImageName(PosText), "|") = 0 Then
                    '如果只含有一张图片
                    ImageName = dImageName(PosText)
                    ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
                    wdApp.Selection.InlineShapes.AddPicture FileName:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
                    wdApp.Selection.TypeParagraph
                Else
                    ImgNames = Split(dImageName(PosText), "|")
                    For n = LBound(ImgNames) To UBound(ImgNames) Step 1
                        ImageName = ImgNames(n)
                        ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
                        wdApp.Selection.InlineShapes.AddPicture FileName:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
                        wdApp.Selection.TypeParagraph
                    Next n
                End If
            End If
        Next i
        
        Doc.SaveAs FilePath
        Doc.Close
        
        
        
        Application.StatusBar = ">>>>>>正在删除Image图片>>>>>>"
        
        For Each Key In dImageName.keys
            If InStr(dImageName(Key), "|") = 0 Then
                ImageName = dImageName(Key)
                ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
                Kill ImagePath
            Else
                ImgNames = Split(dImageName(Key), "|")
                For n = LBound(ImgNames) To UBound(ImgNames) Step 1
                    ImageName = ImgNames(n)
                    ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
                    Kill ImagePath
                Next n
            End If
        Next Key
        
        
        UsedTime = VBA.Timer - StartTime
        Debug.Print FileName; "    "; UsedTime
        'MsgBox "本次运行耗时:" & Format(UsedTime, "#0.0000秒")
        
    ErrorExit:
        If Not wdApp Is Nothing Then wdApp.Quit
        Set wdApp = Nothing
        Set Doc = Nothing
        
        AppSettings False
        Exit Sub
    ErrHandler:
        If Err.Number <> 0 Then
            'MsgBox Err.Description & "!", vbCritical, "QQ 84857038"
            Debug.Print Err.Description
            Err.Clear
            Resume ErrorExit
        End If
    End Sub
    Public Function RegReplace(ByVal OrgText As String, ByVal Pattern As String, Optional RepStr As String = "") As String
    '传递参数 :原字符串, 匹配模式 ,替换字符
        Dim Regex As Object
        Dim newText As String
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        newText = Regex.Replace(OrgText, RepStr)
        RegReplace = newText
        Set Regex = Nothing
    End Function
    Public Sub AppSettings(Optional IsStart As Boolean = True)
        If IsStart Then
            Application.ScreenUpdating = False
            Application.DisplayAlerts = False
            Application.Calculation = xlCalculationManual
            Application.StatusBar = ">>>>>>>>Macro Is Running>>>>>>>>"
        Else
            Application.ScreenUpdating = True
            Application.DisplayAlerts = True
            Application.Calculation = xlCalculationAutomatic
            Application.StatusBar = False
        End If
    End Sub
    

      

  • 相关阅读:
    获取浏览器类型和版本
    js 防抖、节流
    判断数据类型
    一个函数判断数据类型
    面试题3道
    如何处理循环的异步操作
    this的原理以及几种使用场景
    v-model原理解析
    小程序setData数据量过大时候会对渲染有影响吗?
    js中in关键字的使用方法
  • 原文地址:https://www.cnblogs.com/nextseven/p/8469172.html
Copyright © 2020-2023  润新知