• 2018-03-01继续完善


    '目前存在的BUG
    '图片补丁存在多个URL
    '题目中间存在小数的问题在正则表达式里加上d+D
    Private Declare Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" (ByVal pCaller As Long, ByVal szURL As String, ByVal szFileName As String, ByVal dwReserved As Long, ByVal lpfnCB As Long) As Long
    Private Declare Function DeleteUrlCacheEntry Lib "wininet" Alias "DeleteUrlCacheEntryA" (ByVal lpszUrlName As String) As Long
    Sub DownloadImageName(ByVal ImageURL As String, ByVal ImagePath As String)
        Dim lngRetVal As Long
        lngRetVal = URLDownloadToFile(0, ImageURL, ImagePath, 0, 0)
        If lngRetVal = 0 Then
            DeleteUrlCacheEntry ImageURL  '清除缓存
            'MsgBox "成功"
        Else
            'MsgBox "失败"
        End If
    End Sub
    Sub LoopGetSubject()
        Dim StartTime As Variant
        Dim UsedTime As Variant
        StartTime = VBA.Timer
        
        Dim msg As Variant
        msg = MsgBox("Choose 'Yes' to Continue,Choose 'No' to Exit !", vbYesNo, "AuthorQQ 84857038")
        If msg = vbNo Then Exit Sub
        
        Dim Sht As Worksheet
        Set Sht = ThisWorkbook.ActiveSheet
        With Sht
            EndRow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
            For i = 2 To EndRow
                SetFontRed .Cells(i, 1).Resize(1, 3)
                FindText = Mid(.Cells(i, 3).Text, 4, Len(.Cells(i, 3).Text) - 8)
                ExamUrl = .Cells(i, 2).Text
                Source = .Cells(i, 1).Text
                Source = Replace(Source, "【", "")
                Source = Replace(Source, "】", "")
                Source = Replace(Source, "解析", "")
                Call GetExamTextByUrl(ExamUrl, FindText, Source)
            Next i
        End With
        Set Sht = Nothing
        UsedTime = VBA.Timer - StartTime
        Debug.Print "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
        MsgBox "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
    End Sub
    Sub GetSubject()
        SetFontRed Application.ActiveCell
        FindText = Mid(Application.ActiveCell.Text, 4, Len(Application.ActiveCell.Text) - 8)
        ExamUrl = Application.ActiveCell.Offset(0, -1).Text
        Source = Application.ActiveCell.Offset(0, -2).Text
        Source = Replace(Source, "【", "")
        Source = Replace(Source, "】", "")
        Source = Replace(Source, "解析", "")
        Call GetExamTextByUrl(ExamUrl, FindText, Source)
    End Sub
    Sub GetExamTextByUrl(ByVal ExamUrl As String, ByVal FindText As String, ByVal Source As String)
        Dim Subject As String
        Dim HasImageText As String
        Dim Question As String
        Dim ImageURL As String
        Dim Answer As String
        Dim HasGetContent As Boolean
        Dim docName As String
        Dim docPath As String
        Dim Independent As Boolean
        Dim IsQuestion As Boolean
        Dim IsAnswer As Boolean
        Dim oneP As Object
        Dim nextTag As Object
        
        'send request
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", ExamUrl, False
            .Send
            WebText = .responsetext
            'Debug.Print WebText
            ' Stop
        End With
        With CreateObject("htmlfile")
            .write WebText
            Set examdiv = .getElementById("sina_keyword_ad_area2")
            '获取试卷文本内容
            ExamText = examdiv.innerText
            
            
            '判断试卷是否含有独立答案
            Independent = ExamText Like "*参考答案*"
            'Debug.Print "  Independent "; Independent
            '设定搜集题目Word文档名称和路径
            docName = Application.ActiveSheet.Name & "_题目搜集.doc"
            docPath = ThisWorkbook.Path & "" & docName
            '判断某个段落是否为题目/答案的开始
            IsQuestion = False
            IsAnswer = False
            '判断是否已经提取到内容
            HasGetContent = False
            '循环所有段落
            For Each oneP In .getElementsByTagName("p")
                If HasGetContent = False Then
                    '判断某段内容是否为题号行
                    'If oneP.innerText Like "##.*" Or oneP.innerText Like "##.*" Then
                    If RegTest(oneP.innerText, "(d{1,2})[..]D.*") Then
                        'sep = Right(oneP.innerHTML, 100)
                        ' Debug.Print "''''''''"; oneP.innerHTML
                        '  sep = RealInnerHtml(oneP.innerHTML)
                        ' Debug.Print sep
                        'Debug.Print InStr(WebText, sep)
                        ' Stop
                        
                        sep = RegGetLast(oneP.innerHTML, "([u4e00-u9fa5]{5,})")
                        HasImageText = Split(WebText, FindText)(0)
                        pos = InStrRev(HasImageText, sep)
                        HasImageText = Mid(HasImageText, pos)
                        Debug.Print ">>>>>汉字分隔符>>>"; sep
                        Debug.Print HasImageText
                        
                        ' Debug.Print WebText
                        'Stop
                        'Debug.Print ">>>>>>>>"; partText
                        'Debug.Print "Sep》》》》"; UCase(sep)
                        'Stop
                        Subject = ""
                        Question = ""
                        ImageURL = ""
                        Answer = ""
                        '开始记录题干内容
                        Subject = oneP.innerText
                        'Debug.Print OneP.innerText
                    Else
                        If InStr(oneP.innerText, FindText) = 0 Then
                            '过滤不相干的问题,仅保留符合条件的问题
                            If Not RegTest(oneP.innerText, "([((]d[))]).*") Then
                                '继续记录问题内容
                                Subject = Subject & oneP.innerText
                            End If
                        End If
                    End If
                    '提取题目图片的地址
                    '直接获取innerhtml
                    
                    '  Set nextTag = oneP.NextSibling
                    '  If Not nextTag Is Nothing Then
                    '   If UCase(nextTag.tagName) = "A" Then
                    '    If nextTag.HasChildNodes Then
                    '        If nextTag.href Like "http://photo.blog.sina.com.cn/showpic.html*" Then
                    '           ImageURL = ImageURL & "|" & nextTag.FirstChild.getAttribute("real_src")
                    '            Debug.Print ImageURL
                    '        End If
                    '      End If
                    ' End If
                    'End If
                    'Stop
                    '提取题目的序号和问题的序号
                    If InStr(oneP.innerText, FindText) > 0 Then
                        SubjectIndex = RegGet(Subject, "(d{1,2})[..]D.*")
                        Question = oneP.innerText
                        questionIndex = RegGet(Question, "[((](d)[))].*")
                        
                        'Debug.Print "题序:"; SubjectIndex; "   问序: "; questionIndex
                        HasGetContent = True
                    End If
                    
                Else
                    '提取内容后 开始找答案
                    '试卷不含独立答案,答案就附在每道题后面
                    If Independent = False Then
                        
                        If IsAnswer = False Then
                            If RegTest(oneP.innerText, "[((](" & questionIndex & ")[))].*") Then
                                Answer = oneP.innerText
                                IsAnswer = True
                                'Exit For
                            End If
                        Else
                            Debug.Print oneP.innerText
                            If RegTest(oneP.innerText, "[((](d)[))].*") Or RegTest(oneP.innerText, "(d{1,2})[..]D.*") Then
                                Exit For
                            Else
                                Answer = Answer & oneP.innerText
                            End If
                        End If
                        
                        
                        
                        
                    Else
                        '试卷还有独立参考答案
                        '判断某段内容的题号是否符合条件
                        If RegTest(oneP.innerText, "(" & SubjectIndex & ")[..].D*") Then
                            IsQuestion = True
                            'Debug.Print isQuestion
                        End If
                        If IsQuestion = True Then
                            '判断某段内容的问题序号是否符合条件
                            If IsAnswer = False Then
                                If RegTest(oneP.innerText, "([((]" & questionIndex & "[))]).*") Then
                                    '记录问题答案
                                    Answer = oneP.innerText
                                    IsAnswer = True
                                    'Exit For
                                End If
                            Else
                                Debug.Print oneP.innerText
                                If RegTest(oneP.innerText, "[((](d)[))].*") Or RegTest(oneP.innerText, "(d{1,2})[..]D.*") Then
                                    Exit For
                                Else
                                    Answer = Answer & oneP.innerText
                                End If
                            End If
                        End If
                    End If
                End If
            Next oneP
            '图片地址处理
            ' ImageURL = Mid(ImageURL, 2)
            '测试
            
            'Debug.Print ImageURL
            Debug.Print Question
            Debug.Print Answer
        End With
        '<span style="font-family:">43.</span>
        '【补丁,有待改进】2017年下半年部分图片提取不到的问题修正
        ImageURL = ""
        If Len(ImageURL) = 0 Then
            Debug.Print "补丁"
            
            
            'Debug.Print "____________________"; hasimagetext
            'Stop
            'Debug.Print InStr(HasImageText, sep)
            '   HasImageText = Split(HasImageText, sep)(1)
            Debug.Print ">>>>>>>>>>>>>>>>>>"; HasImageText
            'Debug.Print InStr(HasImageText, "real_src")
            'HasImageText = UCase(HasImageText)
            'Debug.Print RegTest(HasImageText, "real_src =""(http.*?)""")
            
            imgs = RegGetArray(HasImageText, "real_src =""(http.*?)""")
            
            For n = LBound(imgs) To UBound(imgs) Step 1
                'Debug.Print imgs(n)
                ImageURL = ImageURL & "|" & imgs(n)
            Next n
            
            'Stop
            ImageURL = Mid(ImageURL, 2)
            Debug.Print "所有图片地址:"; ImageURL
            'hasimagetext = Split(hasimagetext, "real_src")(UBound(Split(hasimagetext, "real_src")))
            'ImageURL = Split(hasimagetext, """")(1)
        End If
        
        '输出题目内容到Word文档
        Dim wdApp As Object
        Dim Doc As Object
        
        On Error Resume Next
        Set wdApp = GetObject(, "Word.Application")
        On Error GoTo 0
        If Not wdApp Is Nothing Then
            wdApp.Visible = True
            On Error Resume Next
            Set Doc = wdApp.documents(docName)
            On Error GoTo 0
            If Doc Is Nothing Then
                Set Doc = wdApp.documents.Add()
                Doc.SaveAs docPath
            End If
        Else
            Set wdApp = CreateObject("Word.Application")
            wdApp.Visible = True
            
            If Dir(docPath) <> "" Then
                Set Doc = wdApp.documents.Open(docPath)
            Else
                Set Doc = wdApp.documents.Add()
                Doc.SaveAs docPath
            End If
        End If
        
        Doc.Activate
        wdApp.Selection.EndKey 6
        wdApp.Selection.TypeParagraph
        wdApp.Selection.InsertBreak 7
        '输出题干内容
        Debug.Print Subject
        Subject = RegReplace(Subject, "(" & SubjectIndex & "[..])") & "."
        Debug.Print Subject
        'Stop
        wdApp.Selection.TypeText Text:=Subject
        wdApp.Selection.TypeParagraph
        
        '下载图片并插入WORD文档
        If ImageURL <> "" Then
            If InStr(ImageURL, "|") = 0 Then
                ImagePath = ThisWorkbook.Path & Application.PathSeparator & "tmp.jpg"
                DownloadImageName ImageURL, ImagePath
                wdApp.Selection.InlineShapes.AddPicture Filename:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
                wdApp.Selection.TypeParagraph
                Kill ImagePath
                'Stop'
            Else
                ImageURLs = Split(ImageURL, "|")
                For n = LBound(ImageURLs) To UBound(ImageURLs) Step 1
                    ImagePath = ThisWorkbook.Path & Application.PathSeparator & n & "tmp.jpg"
                    DownloadImageName ImageURLs(n), ImagePath
                    Debug.Print ImagePath
                    wdApp.Selection.InlineShapes.AddPicture Filename:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
                    wdApp.Selection.TypeParagraph
                    Kill ImagePath
                Next n
            End If
        End If
        '输出问题内容
        wdApp.Selection.TypeText Text:=RegReplace(Question, "([((]d[))])")
        wdApp.Selection.TypeParagraph
        '输出答案内容
        
        sp = RegGet(Answer, "([((]" & questionIndex & "[))]).*")
        'Debug.Print Sp
        If Len(sp) > 0 Then
            Answer = Split(Answer, sp)(1)
            sp = RegGet(Answer, "([((]" & questionIndex + 1 & "[))]).*")
            If Len(sp) > 0 Then
                Answer = Split(Answer, sp)(0)
            End If
        End If
        Debug.Print Answer
        Answer = RegReplace(Answer, "(【来源】.*)")
        Answer = RegReplace(Answer, "(【解析】.*)")
        Debug.Print Answer
        'Stop
        wdApp.Selection.TypeText Text:="【答案】" & Answer
        
        wdApp.Selection.TypeParagraph
        wdApp.Selection.TypeText Text:="[ 来源:" & Source & " 第" & SubjectIndex & "题 第(" & questionIndex & ")问 ]"
        wdApp.Selection.TypeParagraph
        
        Set wdApp = Nothing
        Set Doc = Nothing
        Set oneP = Nothing
    End Sub
    Private Function RegTest(ByVal OrgText As String, ByVal Pattern As String) As Boolean
    '传递参数 :原字符串, 匹配模式
        Dim Regex As Object
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        RegTest = Regex.test(OrgText)
        Set Regex = Nothing
    End Function
    Public Function RegGet(ByVal OrgText As String, ByVal Pattern As String) As String
    '传递参数 :原字符串, 匹配模式
        Dim Regex As Object
        Dim Mh As Object
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        If Regex.test(OrgText) Then
            Set Mh = Regex.Execute(OrgText)
            RegGet = Mh.Item(0).submatches(0)
        Else
            RegGet = ""
        End If
        Set Regex = Nothing
    End Function
    Sub SetFontRed(ByVal Rng As Range)
        With Rng.Font
            .Color = -16776961
            .TintAndShade = 0
        End With
    End Sub
    Public Function RegReplace(ByVal OrgText As String, ByVal Pattern As String, Optional RepStr As String = "") As String
    '传递参数 :原字符串, 匹配模式 ,替换字符
        Dim Regex As Object
        Dim newText As String
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        newText = Regex.Replace(OrgText, RepStr)
        RegReplace = newText
        Set Regex = Nothing
    End Function
    
    
    Public Function RegGetArray(ByVal OrgText As String, ByVal Pattern As String) As String()
        Dim Reg As Object, Mh As Object, OneMh As Object
        Dim Arr() As String, Index As Long
        Dim Elm As String
        Set Reg = CreateObject("Vbscript.Regexp")
        With Reg
            .MultiLine = True
            .Global = True
            .Ignorecase = False
            .Pattern = Pattern
            Set Mh = .Execute(OrgText)
            
            Index = 0
            ReDim Arr(1 To 1)
            For Each OneMh In Mh
                Index = Index + 1
                ReDim Preserve Arr(1 To Index)
                'If OneMh.submatches(0) <> "" Then Elm = OneMh.submatches(0)
                Arr(Index) = OneMh.submatches(0)
                'Debug.Print OneMh.submatches(0)
            Next OneMh
        End With
        RegGetArray = Arr
        Set Reg = Nothing
        Set Mh = Nothing
    End Function
    
    Function RealInnerHtml(ByVal OrgInnerHtml) As String
          Dim x As String
          x = OrgInnerHtml
          x = Replace(x, "SPAN", "span")
          x = Replace(x, "FONT-SIZE", "font-size")
          x = Replace(x, "FONT-FAMILY", "font-family")
          x = Replace(x, "FONT", "font")
          x = Replace(x, "WBR", "wbr")
          x = Replace(x, "COLOR", "color")
          RealInnerHtml = x
    End Function
    Public Function RegGetLast(ByVal OrgText As String, ByVal Pattern As String) As String
    '传递参数 :原字符串, 匹配模式
        Dim Regex As Object
        Dim Mh As Object
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        If Regex.test(OrgText) Then
            Set Mh = Regex.Execute(OrgText)
            'RegGetLast = Mh.Item(0).submatches(0)
            For Each OneMh In Mh
                RegGetLast = OneMh.submatches(0)
            Next OneMh
        Else
            RegGetLast = ""
        End If
        Set Regex = Nothing
    End Function
    

      

  • 相关阅读:
    springcloud -- sleuth+zipkin整合rabbitMQ详解
    docker 更新后出现 error during connect
    springcloud --- spring cloud sleuth和zipkin日志管理(spring boot 2.18)
    kotlin系列文章 --- 3.条件控制
    kotlin -- 可见性修饰符
    kotlin系列文章 --- 2.基本语法
    kotlin系列文章 --- 1.初识kotlin
    oracle索引失效总结
    mysql常用命令行操作
    JavaFX简介和Scene Builder工具的安装使用简易教程
  • 原文地址:https://www.cnblogs.com/nextseven/p/8486417.html
Copyright © 2020-2023  润新知