• GOCR v0.50 原理分析


    一,简介:

    GOCR是一个c写的开源OCR库,GNU Public License,作者:Joerg Schulenburg

    项目主页:http://jocr.sourceforge.net/index.html

    源代码(v0.50) : http://pan.baidu.com/s/1y1Jj1 (VS2005工程项目)

    Update : http://pan.baidu.com/s/1c0b278O (windows下通过liblept支持jpeg/png等格式的OCR)

    版本(version.h):

    #define version_string "0.50"
    #define release_string "20130305"

    二,原理分析:

    1,GOCR的主要流程如下:

    /*
     * pgm2asc - run the whole recognition pipeline on the image in job->src.p.
     *
     * Stages, in call order: otsu()/thresholding() to fix the threshold
     * job->cfg.cs, scan_boxes(), dust/barcode/picture handling, hole gluing,
     * rotation and text line detection, box splitting/gluing, sorting,
     * measure_pitch(), char_recognition() (repeated if adjust_text_lines()
     * reports a change), the post-processing passes, list_insert_spaces(),
     * optional context_correction() and finally store_boxtree_lines().
     * Progress values reported along the way are estimates only.
     *
     * job: job description, must not be NULL (asserted before first use).
     *
     * Returns 0 when the pipeline ran to the end, 1 when scan_boxes() left
     * job->res.numC at zero (no boxes found).
     */
    int pgm2asc(job_t *job)
    {
      pix *pp;
      progress_counter_t *pc;
      static int multi_image_count=0;  /* number of image within multi-image */
      int orig_cs=0;

      /* check the pointer before the first dereference below */
      assert(job);

      /* NOTE(review): orig_cs is not static, so it is only non-zero during the
         very first call; on later calls the otsu() result always replaces
         job->cfg.cs. Confirm that this is intended for multi-images. */
      if (!multi_image_count) orig_cs = job->cfg.cs; /* save for multi-images */

      multi_image_count++;

      /* FIXME jb: remove pp */
      pp = &(job->src.p);

      pc = open_progress(100,"pgm2asc_main");
      progress(0,pc); /* start progress output 0% 0% */
    #if 0 /* dont vast memory */
      /* FIXME jb: malloc */
      if ( job->cfg.verbose & 32 ) {
        // generate 2nd imagebuffer for debugging output
        job->tmp.ppo.p = (unsigned char *)malloc(job->src.p.y * job->src.p.x);
        // buffer
        assert(job->tmp.ppo.p);
        copybox(&job->src.p,
                0, 0, job->src.p.x, job->src.p.y,
                &job->tmp.ppo,
                job->src.p.x * job->src.p.y);
      }
    #else
      job->tmp.ppo=job->src.p; /* temporarely, removed later */
    #endif
      // if (job->cfg.verbose&32) debug_img("out000.ppm",job,0);

      /* ----- count colors ------ create histogram -------
         - this should be used to create a upper and lower limit for cs
         - cs is the optimum gray value between cs_min and cs_max
         - also inverse scans could be detected here later */
      if (orig_cs==0)
        job->cfg.cs=otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
      else  // dont set cs, output stats + do inversion if needed 2010-10-07
        otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
    //  if (job->cfg.verbose&32) debug_img("out001.ppm",job,0);
      /* renormalize the image and set the normalized threshold value */
      job->cfg.cs=thresholding( pp->p,pp->y,pp->x,0,0,pp->x,pp->y, job->cfg.cs );
      if( job->cfg.verbose )
        fprintf(stderr, "# thresholding new_threshold= %d\n", job->cfg.cs);
    //  if (job->cfg.verbose&32) debug_img("out002.ppm",job,0);

      progress(5,pc); /* progress is only estimated */


      /* this is first step for reorganize the PG
         ---- look for letters, put rectangular frames around letters
         letter = connected points near color F
         should be used by dust removing (faster) and line detection!
         ---- 0..cs = black letters, last change = Mai99 */

      progress(8,pc); /* progress is only estimated */

    //  if (job->cfg.verbose&32) debug_img("out008.ppm",job,8);
      scan_boxes( job, pp );
      if ( !job->res.numC ){
        fprintf( stderr,"# no boxes found - stopped\n" );
        if(job->cfg.verbose&32) debug_img("out01",job,8);
        /***** should free stuff, etc) */
        /* close the counter opened above, as the normal exit path does */
        close_progress(pc);
        return(1);
      }
      // tmp10/bug100818a.pgm creates artefacts on image
    //  if (job->cfg.verbose&32) debug_img("out00",job,4+8);

      progress(10,pc); /* progress is only estimated */
      // if(job->cfg.verbose&32) debug_img("out01",job,4+8);
      // output_list(job);  // for debugging
      // ToDo: matrix printer preprocessing

      remove_dust( job ); /* from the &(job->res.boxlist)! */
    // if(job->cfg.verbose&32) debug_img("out02",job,4+8);
    // output_list(job);  // for debugging
    #if 0 // ToDo 2010-10-15 destroys QR-barcodes
      smooth_borders( job ); /* only for big chars */
    #endif
      progress(12,pc); /* progress is only estimated */
    // if(job->cfg.verbose&32) debug_img("out03",job,4+8);
    // output_list(job);  // for debugging

      detect_barcode( job );  /* mark barcode */
    // if(job->cfg.verbose&32) debug_img("out04",job,4+8);
    // output_list(job);  // for debugging

      detect_pictures( job ); /* mark pictures */
    //  if(job->cfg.verbose&32) debug_img("out05",job,4+8);
    // output_list(job);  // for debugging

      remove_pictures( job ); /* do this as early as possible, before layout */
    //  if(job->cfg.verbose&32) debug_img("out06",job,4+8);
    // output_list(job);  // for debugging

      glue_holes_inside_chars( pp ); /* including count subboxes (holes)  */

      detect_rotation_angle( job );

    #if 1         /* Rotate the whole picture! move boxes */
      if( job->res.lines.dy!=0 ){  // move down lowest first, move up highest first
        // in work! ??? (at end set dy=0) think on ppo!
      }
    #endif
      detect_text_lines( pp, job->cfg.mode ); /* detect and mark job->tmp.ppo */
    // if(job->cfg.verbose&32) debug_img("out07",job,4+8);
      progress(20,pc); /* progress is only estimated */

      add_line_info( job /* , &(job->res.boxlist) */);
      if (job->cfg.verbose&32) debug_img("out10",job,4+8);

      divide_vert_glued_boxes( pp, job->cfg.mode); /* after add_line_info, before list_sort! */
    //  if(job->cfg.verbose&32) debug_img("out11",job,0);

      remove_melted_serifs( job, pp ); /* make some corrections on pixmap */
      /* list_ins seems to sort in the boxes on the wrong place ??? */
    //  if(job->cfg.verbose&32) debug_img("out12",job,4+8);

      glue_broken_chars( job, pp ); /* 2nd glue */
    //  if(job->cfg.verbose&32) debug_img("out14",job,4+8);
    // 2010-09-24 overall box size is correct here, but later broken

      remove_rest_of_dust( job );
    //  if(job->cfg.verbose&32) debug_img("out15",job,4+8);

      /* better sort after dust is removed (slow for lot of pixels) */
      list_sort(&(job->res.boxlist), sort_box_func);

      measure_pitch( job );

      if(job->cfg.mode&64) find_same_chars( pp );
      progress(30,pc); /* progress is only estimated */
    //  if(job->cfg.verbose&32) debug_img("out16",job,4+8);

      char_recognition( pp, job->cfg.mode);
      progress(60,pc); /* progress is only estimated */
    //  if(job->cfg.verbose&32) debug_img("out17",job,4+8);

      if ( adjust_text_lines( pp, job->cfg.mode ) ) { /* correct using chars */
        /* may be, characters/pictures have changed line number */
        list_sort(&(job->res.boxlist), sort_box_func);
        // 2nd recognition call if lines are adjusted
        char_recognition( pp, job->cfg.mode);
      }

    #define BlownUpDrawing 0     /* german: Explosionszeichnung, temporarly */
    #if     BlownUpDrawing == 1  /* german: Explosionszeichnung */
    { /* just for debugging */
      int i,ii,ni; struct box *box2;
      i=ii=ni=0;
      for_each_data(&(job->res.boxlist)) { /* count boxes */
        box2 = (struct box *)list_get_current(&(job->res.boxlist));
        if (box2->c==UNKNOWN)  i++;
        if (box2->c==PICTURE) ii++;
        ni++;
      } end_for_each(&(job->res.boxlist));
      if (job->cfg.verbose)
        fprintf(stderr,"# debug: unknown= %d picts= %d boxes= %d\n",i,ii,ni);
    }
    #endif
      // ----------- write out20.pgm ----------- mark lines + boxes
      if (job->cfg.verbose&32) debug_img("out20",job,1+4+8);

      compare_unknown_with_known_chars( pp, job->cfg.mode);
      progress(70,pc); /* progress is only estimated */

      try_to_divide_boxes( pp, job->cfg.mode);
      progress(80,pc); /* progress is only estimated */

      /* --- list output ---- for debugging --- */
      if (job->cfg.verbose&6) output_list(job);

      /* ---- insert spaces ---- */
      list_insert_spaces( pp , job );

      // ---- proof difficult chars Il1 by context view ----
      if (job->cfg.verbose)
        fprintf(stderr,"# context correction if !(mode&32)\n");
      if (!(job->cfg.mode&32)) context_correction( job );

      store_boxtree_lines( job, job->cfg.mode );
      progress(90,pc); /* progress is only estimated */

    /* 0050002.pgm.gz ca. 109 digits, only 50 recognized (only in lines?)
     * ./gocr -v 39 -m 56 -e - -m 4 -C 0-9 -f XML tmp0406/0050002.pbm.gz
     *  awk 'BEGIN{num=0}/1</box>/{num++;}END{print num}' o
     * 15*0 24*1 18*2 19*3 15*4 6*5 6*6 6*7 4*8 8*9 sum=125digits counted boxes
     *  9*0 19*1 14*2 15*3 11*4 6*5 5*6 6*7 4*8 8*9 sum=97digits recognized
     * 1*1 1*7 not recognized (Oct04)
     *  33*SPC 76*NL = 109 spaces + 36*unknown sum=241 * 16 missed
     */
    #if     BlownUpDrawing == 1  /* german: Explosionszeichnung */
    { /* just for debugging */
      int i,ii,ni; struct box *box2; const char *testc="0123456789ABCDEFGHIJK";
      i=ii=ni=0;
      for_each_data(&(job->res.boxlist)) { /* count boxes */
        box2 = (struct box *)list_get_current(&(job->res.boxlist));
        if (box2->c==UNKNOWN)  i++;
        if (box2->c==PICTURE) ii++;
        if (box2->c>' ' && box2->c<='z') ni++;
      } end_for_each(&(job->res.boxlist));
      if(job->cfg.verbose)
        fprintf(stderr,"# debug: (_)= %d picts= %d chars= %d",i,ii,ni);
      for (i=0;i<20;i++) {
        ni=0;
        for_each_data(&(job->res.boxlist)) { /* count boxes */
          box2 = (struct box *)list_get_current(&(job->res.boxlist));
          if (box2->c==testc[i]) ni++;
        } end_for_each(&(job->res.boxlist));
        if(job->cfg.verbose && ni>0)
          fprintf(stderr," (%c)=%d",testc[i],ni);
      }
      if(job->cfg.verbose)
        fprintf(stderr,"\n");
    }
    #endif

      // ---- frame-size-histogram
      // ---- (my own defined) distance between letters
      // ---- write internal picture of textsite
      // ----------- write out30.pgm -----------
      if( job->cfg.verbose&32 ) debug_img("out30",job,2+4);

      progress(100,pc); /* progress is only estimated */

      close_progress(pc);

      return 0;     /* what should I return? error-state? num-of-chars? */
    }

    2,Scan boxes分析:

    流程:从上往下,分别在X,Y轴方向投影,得到box list。

    helloworld

    3,去除噪点:

    /* ---- remove dust ---------------------------------
       What is dust? I think, this is a very small pixel cluster without
       neighbours. Of course not all dust clusters can be detected correct.
       This feature should be possible to switch off via option.
       -> may be, all clusters should be stored here?
       speed is very slow, I know, but I am happy that it is working well
    */

    4,detect barcode and pictures , remove pictures:

    图片:所有box的平均宽度为avgwidth,平均高度为avgheight,符合box.width > 4 * avgwidth || height > 4*avgheight条件并且相近大小的box少于4个的box认为是图像box。

    5,glue holes inside chars:

    /* ---- join holes to chars( before step1 ) v0.42  -----------------------
       join boxes lying inside another box (usually holes, ex: "aeobdg46890")
       Dont add dust to a char!
       lines are not detected yet
    */

    6,detect rotation angle:

    /*
    ** Detect rotation angle (one for whole image)
    ** old: longest text-line and determining the angle of this line.
     *
     * search right nearest neighbour of each box and average vectors
     * to get the text orientation,
     * upside down decision is not made here (I dont know how to do it)
     *  ToDo: set job->res.lines.{dx,dy}
     * pass 1: get mean vector to nearest char
     * pass 2: get mean vector to nearest char without outriders to pass 1
     * estimate direction as (dx,dy,num)[pass]
     * ToDo: estimate an error, boxes only work fine for zero-rotation
     *       for 45 degree use vectors, not boxes to get base line
     */

    7,detect text lines:

    http://en.wikipedia.org/wiki/Cap_height

    8,measure pitch:

    估计空格的宽度。

    9,识别字符:

    GOCR的识别不基于机器学习,没有training过程,完全依靠先验规则,因此只能识别英文字母,数字,标点等字符。识别过程是一条filter链:每个filter判断box是否为某个字符,若是则跳过后续filter。

    a,从box外引出一条射线从某个方向(左,右,上,下)某个坐标(x,y)向box内部,第一个交点位置必须符合某个字符的先验规则;

    ray

    代码:

    /* Walk through pixmap p starting at (x,y), one pixel per step in
     * direction r, for at most l steps.
     * The walk stops early at the first pixel for which
     * ((getpixel(p,x,y) < cs) ^ col) is non-zero, or when the moving
     * coordinate leaves the pixmap.
     * Returns the number of steps taken before stopping; 0 if the start
     * point lies outside the pixmap or r is not one of UP, DO, LE, RI. */
    int loop(pix *p,int x,int y,int l,int cs,int col, DIRECTION r){
      int steps = 0;
      int step_x = 0, step_y = 0;

      /* nothing to walk when starting outside the image */
      if (x < 0 || y < 0 || x >= p->x || y >= p->y)
        return 0;

      /* translate the direction into a unit step */
      switch (r) {
      case UP: step_y = -1; break;
      case DO: step_y =  1; break;
      case LE: step_x = -1; break;
      case RI: step_x =  1; break;
      default: return 0;
      }

      /* only one coordinate moves, so the full bounds test below is
         equivalent to testing just the moving coordinate */
      while (steps < l && x >= 0 && y >= 0 && x < p->x && y < p->y) {
        if ((getpixel(p, x, y) < cs) ^ col)
          break;
        steps++;
        x += step_x;
        y += step_y;
      }
      return steps;
    }

    b,经过box的一条直线与字符的交点个数必须符合某个字符的先验规则,算法:计算这样的点(如从左向右:Pixel(x,y) = white && Pixel(x+1,y) = black ) 的个数

    line

    代码:

    /* Count white-to-black transitions along the straight line from
     * (x0,y0) to (x1,y1) in pixmap p. A pixel is black when its value is
     * below cs. The line is sampled at max(|dx|,|dy|)+1 points using integer
     * interpolation; the state before the first sample is taken as white, so
     * a black start pixel counts as one transition.
     * Returns the number of transitions found. */
    int num_cross(int x0, int x1, int y0, int y1, pix *p, int cs) {
      const int delta_x = x1 - x0;
      const int delta_y = y1 - y0;
      const int span = MAX(abs(delta_x), abs(delta_y));
      int crossings = 0;
      int prev_black = 0;   /* 0 = white, 1 = black */
      int step;

      for (step = 0; step <= span; step++) {
        int px = x0, py = y0, is_black;

        if (span != 0) {    /* span == 0: single point, stay on (x0,y0) */
          px = x0 + step * delta_x / span;
          py = y0 + step * delta_y / span;
        }
        is_black = (getpixel(p, px, py) < cs);
        if (is_black && !prev_black)
          crossings++;
        prev_black = is_black;
      }
      return crossings;
    }

    c,孔洞的个数必须符合某个字符的先验规则,比如A有一个洞;这一步只是判断,实际工作在第5步已经完成。

    hole

    d,如下面识别“}”的代码:

    意思是:横向穿过dy条线,每条线与字符的交点个数均为1;在字符的左半部分,竖直穿过dx/2条线,交点个数均为2,即上下两端向左弯的部分;等等。

    //  --------- test {} --------------------------------
    // Rule-based test for a closing brace: on success the box is tagged with
    // '}' via Setac(). The for statement has no increment and ends in
    // `break`, so its body runs at most once; it is entered only when
    // dx>2, dy>5 and 2*dy>3*dx (clearly taller than wide).
    // `Break` is defined outside this excerpt - presumably a macro that
    // leaves the block (rejecting the candidate); verify against its definition.
    // ad starts at 99, is scaled down by the percentage penalties below and is
    // finally handed to Setac() - presumably a confidence value, TODO confirm.
    // bp is presumably the character bitmap of size dx x dy - TODO confirm.
       for(ad=d=99;dx>2 && dy>5 && 2*dy>3*dx;){
          DBG( wchar_t c_ask='}'; )
          if (!hchar) ad=97*ad/100; 
          // every horizontal scan line over the full width must show exactly
          // one white->black transition
          for(y=0;y<dy;y++){
            if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) break;
          } if (y<dy) Break;
          // every vertical scan line in the left half should show exactly two
          // transitions
          // NOTE(review): the test after this loop uses y, not the loop
          // variable x. If Break leaves the block, y==dy here, and the guard
          // 2*dy>3*dx makes y<dx/2 always false, so the result of this loop
          // looks unused - probably meant `x<dx/2`; confirm before changing.
          for(x=0;x<dx/2;x++){
            if( num_cross(x,x,0,dy-1,bp,cs) != 2 ) break;
          } if (y<dx/2) Break;
          // middle section (rows dy/4 .. dy-1-dy/4): no transition in the
          // leftmost column, exactly one in the rightmost column
          if ( num_cross(   0,   0,dy/4,dy-1-dy/4,bp,cs) != 0 ) Break;
          if ( num_cross(dx-1,dx-1,dy/4,dy-1-dy/4,bp,cs) != 1 ) Break;
          // at row dy/4, coming from the right edge: first the steps up to the
          // first pixel below cs, then the length of the following run of
          // pixels below cs
          i1=loop(bp,dx-1   ,dy/4,dx,cs,0,LE);
          i1=loop(bp,dx-1-i1,dy/4,dx,cs,1,LE); // thickness1
          // among the rows around dy/2 pick the one (i3) with the smallest
          // gap (i2) from the right edge, then measure the run length there
          for (i2=dx,i3=y=dy/2-1-dy/16;y<dy/2+2+dy/16;y++)
           { x=loop(bp,dx-1   , y,dx,cs,0,LE); if (x<i2) {i2=x;i3=y;} }
          i2=  loop(bp,dx-1-i2,i3,dx,cs,1,LE); // thickness2
          // the run near the middle must exceed the one at dy/4 by more than dx/16
          if (i2<i1+dx/16+1) Break;
          // soft criteria: each hit only scales ad down by 1 or 2 percent
          if ( loop(bp,dx-1,dy-1,dx,cs,0,LE)>3*dx/4 ) {ad=99*ad/100;MSG({})}
          if ( loop(bp,dx-1,   0,dx,cs,0,LE)>3*dx/4 ) {ad=99*ad/100;MSG({})} // >
          if ( loop(bp,dx-1,   0,dy,cs,0,DO)<dy/2-dy/8-1 ) {ad=98*ad/100;MSG({})}
          if ( loop(bp,dx-1,dy-1,dy,cs,0,UP)<dy/2-dy/8-1 ) {ad=98*ad/100;MSG({})} // )
          // hard criterion: going down the rightmost column from the top, the
          // first pixel below cs must come after more than dy/4 steps
          if ( loop(bp,dx-1,   0,dy,cs,0,DO)<=dy/4) Break;
          // for dy>=8: compares the left-side gaps at rows 0, dy/8 and dy/4;
          // a large value of gap(0)+gap(dy/4)-2*gap(dy/8) costs 2 percent
          if (dy>=8)
          if (   loop(bp,0,   0,dx,cs,0,RI)
             +   loop(bp,0,dy/4,dx,cs,0,RI)
             - 2*loop(bp,0,dy/8,dx,cs,0,RI) >=dx/8 ) {ad=98*ad/100;MSG({})} // <
          if ( loop(bp,1,dy-1,dy,cs,0,UP)>dy/4 ) Break; // ???
          // get_bw() is not shown in this excerpt; presumably it reports
          // whether pixels below cs occur on column x1 of the original image
          // in the top or bottom quarter of the box - TODO confirm
          if ( get_bw(x1,x1,y0,y0+dy/4,box1->p,cs,1) == 1 
            || get_bw(x1,x1,y1-dy/4,y1,box1->p,cs,1) == 1 ) Break;
          // all hard criteria passed: record '}' for this box together with ad
          Setac(box1,(bc='}'),ad);break;
       }

    10,compare_unknown_with_known_chars try_to_divide_boxes等后处理;

    11,list_insert_spaces 插入空格;

    12,store_boxtree_lines;

    13,输出识别结果。

  • 相关阅读:
    tesseract动态库依赖关系
    面向对象分析与设计笔记(一)
    用例图笔记
    矩阵乘法求解
    二维数组 Visual Studio怎么监视
    cmake windows caffe cuda版本的切换
    Python入门
    Python基本数据类型
    【LabVIEW】二进制文件的存储与读取方法
    【LabVIEW】文件对话框点击取消后报错、实现自定义文件名
  • 原文地址:https://www.cnblogs.com/xylc/p/3412954.html
Copyright © 2020-2023  润新知