• PDF数据提取------3.解析Demo


    1.PDF中文本字符串格式中关键值信息抓取(已完成)

        简介:这种解析方式比较传统,也最简单,主要依靠熟练使用正则表达式(Regular Expression)做语义识别和验证。例如抓取下图红色圈内的关键信息。

    1.1

            // Call site: run the meeting-date extraction and keep the returned text.
            string mettingData = GetMeetingData();
    
            /// <summary>
            /// Searches every page of the PDF for text that announces a meeting together with a
            /// Chinese-formatted date, then narrows the candidates with a list of more specific
            /// date/time patterns.
            /// </summary>
            /// <returns>The value produced by <c>GetMeetingDateFilter</c> for the candidates found.</returns>
            public string GetMeetingData()
            {
                // Coarse pattern: "会议", 2-15 arbitrary characters, a "yyyy年M月d日" date, and up to
                // 15 trailing characters (room for the time of day).
                string patternAll = @"(?<NDAandCAMDate>会\s*议\s*.{2,15}\d{2,4}\s*年\s*\d{1,2}\s*月\s*\d{1,2}\s*日.{0,15})";

                // Shared prefix of every fine-grained pattern: the date followed by an optional
                // weekday in half-width or full-width parentheses, e.g. "(星期三)" or "（周五）".
                const string datePart = @"(?<year>\d{2,4})年(?<month>\d{1,2})月(?<day>\d{1,2})日";
                const string weekdayPart = @"((\(|（)(星期|周)(一|二|三|四|五|六|七)(\)|）))?";

                // Ordered from most to least specific; the filter tries them in this order.
                List<string> patternFilter = new List<string>
                {
                    datePart + weekdayPart + @"(上午)?(?<hour>\d{1,2})(:|点|时)(?<minute>\d{1,2})",
                    datePart + weekdayPart + @"下午(?<hour>\d{1,2})(:|点|时)(?<minute>\d{1,2})",
                    datePart + weekdayPart + @"(上午)?(?<hour>\d{1,2})点半",
                    datePart + weekdayPart + @"下午(?<hour>\d{1,2})点半",
                    datePart + weekdayPart + @"(上午)?(?<hour>\d{1,2})(点|时)",
                    datePart + weekdayPart + @"下午(?<hour>\d{1,2})(点|时)",
                    datePart,
                };

                PdfAnalyzer pa = new PdfAnalyzer();
                PDFNet.Initialize();

                // NOTE(review): `item` (the PDF path) is not declared in this snippet; it is
                // presumably a field or an enclosing loop variable - confirm against the caller.
                using (PDFDoc doc = new PDFDoc(item))
                {
                    doc.InitSecurityHandler();
                    List<PdfString> foundAll = pa.RegexSearchAllPages(doc, patternAll);

                    // Filter while the document is still open, and pass the fine-grained
                    // pattern list (not the coarse search pattern) to the filter.
                    return GetMeetingDateFilter(foundAll, patternFilter);
                }
            }
    
            /// <summary>
            /// Runs each candidate snippet through the given patterns, in order, and returns the
            /// text of the first match that <c>IsValid</c> accepts.
            /// </summary>
            /// <param name="foundAll">Candidate snippets found by the coarse search.</param>
            /// <param name="patternAll">Fine-grained regex patterns, tried in list order.</param>
            /// <returns>
            /// The matched date text (spaces removed), or a five-space placeholder when nothing
            /// valid is found.
            /// </returns>
            private string GetMeetingDateFilter(List<PdfString> foundAll, List<string> patternAll)
            {
                // Placeholder the original code used for "no meeting date".
                const string notFound = "     ";

                if (foundAll == null || patternAll == null)
                    return notFound;

                // Build each Regex once instead of once per snippet/pattern pair.
                List<Regex> filters = new List<Regex>(patternAll.Count);
                foreach (string pattern in patternAll)
                {
                    filters.Add(new Regex(pattern));
                }

                foreach (PdfString pdfString in foundAll)
                {
                    if (pdfString == null)
                        continue;

                    // Extracted PDF text may contain stray spaces between characters.
                    string text = pdfString.ToString().Replace(" ", "");
                    foreach (Regex filter in filters)
                    {
                        Match ma = filter.Match(text);

                        // Return the matched text; previously the placeholder was returned
                        // even when a valid date had been found.
                        if (ma.Success && IsValid(ma))
                            return ma.Value;
                    }
                }

                return notFound;
            }

    注解:

          a.第一次通过 pa.RegexSearchAllPages(doc, patternAll) 在所有页面中搜索包含会议时间信息的文本片段

          b.第二次通过更精确的正则表达式逐个匹配这些片段,提取出会议日期(Meeting Date)

    2.PDF类似表格形式关键值数据抓取。(已完成)

               简介:这种格式需要用到封装好的数据结构PdfString类和PdfAnalyzer类,根据给定关键词在指定的坐标范围内提取数据,例如提取下图中的数据。

    2.1

    /// <summary>
    /// Extracts the premium percentage for one instrument from a table-like PDF. It takes
    /// the x position of the first <paramref name="ricCode"/> match and the y position of
    /// the first "Premium" match, then returns the first percentage value lying within a
    /// small tolerance of that point.
    /// </summary>
    /// <param name="path">Path of the PDF file to analyse.</param>
    /// <param name="ricCode">Instrument code; it is passed to the search as a regex pattern.</param>
    /// <returns>
    /// The percentage text without the '%' sign, or an empty string when nothing was found
    /// or the analysis failed (failures are logged, not thrown).
    /// </returns>
    private string GetPremium(string path, string ricCode)
            {
                // Largest allowed distance on each axis between the anchor point and a value.
                const int tolerance = 10;
                string result = string.Empty;

                try
                {
                    PDFNet.Initialize();

                    // `new` never yields null, so the old null check was dead code; a load
                    // failure surfaces as an exception handled by the catch below.
                    using (PDFDoc doc = new PDFDoc(path))
                    {
                        doc.InitSecurityHandler();

                        PdfAnalyzer pa = new PdfAnalyzer();
                        // NOTE(review): ricCode is used as a regex, so a '.' in the code matches
                        // any character - consider Regex.Escape if callers pass literal codes.
                        List<PdfString> listX1 = pa.RegexSearchAllPages(doc, ricCode);
                        List<PdfString> listY1 = pa.RegexSearchAllPages(doc, @"[Pp]remium");
                        List<PdfString> listResult = pa.RegexSearchAllPages(doc, @"(?<Result>\d+\.\d+%)");

                        if (listX1 == null || listY1 == null || listResult == null
                            || listX1.Count == 0 || listY1.Count == 0 || listResult.Count == 0)
                        {
                            string msg = string.Format("({0}),([P|p]remium) exist missing value ,so Gearing is empty value.", ricCode);
                            Logger.Log(msg, Logger.LogType.Warning);
                            return result;
                        }

                        // Anchor point: x from the code's position, y from the "Premium" label.
                        int x1 = System.Convert.ToInt32(listX1[0].Position.x1);
                        int y1 = System.Convert.ToInt32(listY1[0].Position.y1);

                        foreach (var item in listResult)
                        {
                            int subX1 = Math.Abs(x1 - System.Convert.ToInt32(item.Position.x1));
                            int subY1 = Math.Abs(y1 - System.Convert.ToInt32(item.Position.y1));

                            if (subX1 <= tolerance && subY1 <= tolerance)
                            {
                                result = item.ToString().Replace("%", "");
                                return result;
                            }
                        }
                    }

                    Logger.Log(string.Format("stock code:{0},extract premium failed .", ricCode), Logger.LogType.Error);
                    return result;
                }
                catch (Exception ex)
                {
                    // ricCode is passed as an argument rather than concatenated into the format
                    // string, so braces in the code cannot break string.Format.
                    string msg = string.Format(
                        "PDF analysis failed for {0}! Action: Need manually input gearing and premium \r\n error msg:{1}",
                        ricCode,
                        ex.Message);
                    Logger.Log(msg, Logger.LogType.Warning);
                    return result;
                }
            }

    3.需要PDF中大量数据转换到Excel中去 (已完成)

               简介:基于2的延伸,加入对行和列边界范围的自动模糊匹配,再根据位置坐标排序,提取正确的数据信息。如图:

    2.22.3

    /// <summary>
    /// Extracts a table from the configured PDF and writes it to a dated CSV file. It finds
    /// the "コード" column header on page 3, collects the four-digit codes below it on each
    /// page, reads the numeric values on each code's row, then writes the rows out and
    /// registers the output file.
    /// </summary>
    private void StartExtractFile()
            {
                List<List<string>> bulkFileFilter;

                PDFNet.Initialize();
                using (PDFDoc doc = new PDFDoc(config.FilePath1))
                {
                    doc.InitSecurityHandler();

                    string patternTitle = @"コード";
                    int page = 3;
                    PdfString ricPosition = GetRicPosition(doc, patternTitle, page);
                    if (ricPosition == null)
                        return;

                    // Four-digit code, and a signed number made of digits, commas and dots.
                    // The '+' alternative must be escaped: "(-|+)" is rejected by .NET.
                    string patternRic = @"\d{4}";
                    string patternValue = @"(-|\+)?\d+(\,|\.|\d)+";

                    List<LineFound> bulkFile = GetValue(doc, ricPosition, patternRic, patternValue);

                    // Index of the row whose column count is treated as the reference.
                    int indexOK = 0;
                    bulkFileFilter = FilterBulkFile(bulkFile, indexOK);
                }

                // FilterBulkFile returns null (and logs) when nothing was extracted; do not
                // delete a previous output or write an empty file in that case.
                if (bulkFileFilter == null)
                    return;

                string filePath = Path.Combine(config.OutputFolder, string.Format("Type1ExtractedFromPdf{0}.csv", DateTime.Now.ToString("dd-MM-yyyy")));

                if (File.Exists(filePath))
                    File.Delete(filePath);

                XlsOrCsvUtil.GenerateStringCsv(filePath, bulkFileFilter);
                AddResult(Path.GetFileNameWithoutExtension(filePath), filePath, "type1");
            }
    
            /// <summary>
            /// Turns the extracted rows into a rectangular table of strings. A row with the same
            /// number of cells as the reference row is copied as-is; any other non-empty row is
            /// reduced to its first cell and padded with empty strings.
            /// </summary>
            /// <param name="bulkFile">Rows extracted from the PDF.</param>
            /// <param name="indexOK">Index of the row whose cell count is the expected column count.</param>
            /// <returns>
            /// The table, or <c>null</c> when there is no data or the reference row is unusable.
            /// </returns>
            private List<List<string>> FilterBulkFile(List<LineFound> bulkFile, int indexOK)
            {
                if (bulkFile == null || bulkFile.Count == 0)
                {
                    Logger.Log("no value data extract from pdf");
                    return null;
                }

                // The reference row must exist and carry data; previously an out-of-range
                // index or a null LineData crashed here.
                if (indexOK < 0 || indexOK >= bulkFile.Count || bulkFile[indexOK].LineData == null)
                {
                    Logger.Log(string.Format("reference row {0} is missing or has no line data", indexOK));
                    return null;
                }

                int count = bulkFile[indexOK].LineData.Count;
                List<List<string>> result = new List<List<string>>();

                foreach (var item in bulkFile)
                {
                    if (item.LineData == null || item.LineData.Count <= 0)
                        continue;

                    List<string> line = new List<string>();
                    if (item.LineData.Count == count)
                    {
                        foreach (var value in item.LineData)
                        {
                            line.Add(value.Words.ToString());
                        }
                    }
                    else
                    {
                        // Column count differs from the reference row: keep only the first
                        // cell and blank out the remaining columns.
                        line.Add(item.LineData[0].Words.ToString());
                        for (int i = 1; i < count; i++)
                        {
                            line.Add(string.Empty);
                        }
                    }
                    result.Add(line);
                }

                return result;
            }
    
            /// <summary>
            /// Walks the document page by page. On each page it collects the codes located
            /// relative to the header position, and for each code the values located relative to
            /// that code's position.
            /// </summary>
            /// <param name="doc">Open PDF document.</param>
            /// <param name="ricPosition">Position of the code column header.</param>
            /// <param name="patternRic">Regex matching one code.</param>
            /// <param name="patternValue">Regex matching one value cell.</param>
            /// <returns>
            /// One <c>LineFound</c> per code. If an exception occurs it is logged and the rows
            /// collected so far are returned.
            /// </returns>
            private List<LineFound> GetValue(PDFDoc doc, PdfString ricPosition, string patternRic, string patternValue)
            {
                List<LineFound> bulkFile = new List<LineFound>();
                try
                {
                    // Pages are numbered from 1, so the last page is number GetPageCount();
                    // the previous "<" bound skipped it.
                    int pageCount = doc.GetPageCount();
                    for (int i = 1; i <= pageCount; i++)
                    {
                        List<PdfString> ric = pa.RegexExtractByPositionWithPage(doc, patternRic, i, ricPosition.Position);
                        if (ric == null)
                            continue;

                        foreach (var item in ric)
                        {
                            LineFound lineFound = new LineFound();
                            lineFound.Ric = item.Words.ToString();
                            lineFound.Position = item.Position;
                            lineFound.PageNumber = i;
                            lineFound.LineData = pa.RegexExtractByPositionWithPage(doc, patternValue, i, item.Position, PositionRect.X2);
                            bulkFile.Add(lineFound);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: log and return the rows gathered before the failure.
                    string msg = string.Format(
                        "\r\n             ClassName:  {0}\r\n             MethodName: {1}\r\n             Message:    {2}",
                        System.Reflection.MethodBase.GetCurrentMethod().DeclaringType.ToString(),
                        System.Reflection.MethodBase.GetCurrentMethod().Name,
                        ex.Message);
                    Logger.Log(msg, Logger.LogType.Error);
                }

                return bulkFile;
            }
    
            /// <summary>
            /// Finds the first occurrence of the header text on the given page.
            /// </summary>
            /// <param name="doc">Open PDF document.</param>
            /// <param name="pattern">Regex matching the column header.</param>
            /// <param name="page">Page number to search.</param>
            /// <returns>The first match, or <c>null</c> (after logging) when the page has no match.</returns>
            /// <remarks>Any exception from the search is logged and rethrown.</remarks>
            private PdfString GetRicPosition(PDFDoc doc, string pattern, int page)
            {
                try
                {
                    // Use the caller's pattern; it was previously ignored in favour of a
                    // hard-coded literal.
                    List<PdfString> ricPosition = pa.RegexSearchByPage(doc, pattern, page);
                    if (ricPosition == null || ricPosition.Count == 0)
                    {
                        // The placeholders now receive arguments; without them string.Format
                        // threw FormatException, turning "not found" into an error.
                        Logger.Log(string.Format(
                            "there is no ric title found by using pattern:{0} to find the ric title ,in the page:{1} of the pdf:{2}",
                            pattern,
                            page,
                            doc.GetFileName()));
                        return null;
                    }

                    return ricPosition[0];
                }
                catch (Exception ex)
                {
                    string msg = string.Format(
                        "\r\n             ClassName:  {0}\r\n             MethodName: {1}\r\n             Message:    {2}",
                        System.Reflection.MethodBase.GetCurrentMethod().DeclaringType.ToString(),
                        System.Reflection.MethodBase.GetCurrentMethod().Name,
                        ex.Message);
                    Logger.Log(msg, Logger.LogType.Error);
                    throw;
                }
            }
        }
    
        /// <summary>
        /// One extracted table row: the code that identifies the row plus the cells found on it.
        /// </summary>
        struct LineFound
        {
            /// <summary>Text of the code that identifies the row.</summary>
            public string Ric { get; set; }
            /// <summary>Position of the code on its page.</summary>
            public Rect Position { get; set; }
            /// <summary>Number of the page the row was found on.</summary>
            public int PageNumber { get; set; }
            /// <summary>Cells extracted for this row.</summary>
            public List<PdfString> LineData { get; set; }
        }

    注解:

           a.由于PDF中数据坐标位置信息是基于页的所以必须按页来解析抓取数据

           b.大概思路,第一次获取“コード”位置,来获取每页中Ric List的集合(获取列并排序)

           c.根据每一列信息获取每一行信息(获取并排序),组合成表格信息

    改进:

           现在这部分还需要代码中手动干预,下一步打算加入自动识别功能,通过获取大量PDF数据自动根据位置信息组合成Table信息

    4.PDF中数据保存图片格式(未完成)

          想法:这种PDF文件我目前还没有好的处理办法,应该需要用到图像识别方面的算法。对于这种文件格式,我现在确实无能为力,

          希望哪位大神能提供一些好的建议。

  • 相关阅读:
    语句
    C#语言基础
    进制转换
    js对URL的相关操作集锦
    js/javascript计时器方法及使用场景
    js中FormData+XMLHttpRequest数据传输
    HTML中footer固定在页面底部的若干种方法
    js/jquery 禁用点击事件
    asp.net微信开发第七篇----高级群发(图文)
    asp.net微信开发第六篇----高级群发(文本)
  • 原文地址:https://www.cnblogs.com/HaifengCai/p/3960039.html
Copyright © 2020-2023  润新知