• ISBN数字识别


    ISBN识别

    学校三级项目需要批量识别ISBN中的数字

    实现的大致思路如下:

    对原始图片按尺寸自动调整大小,高斯滤波去噪,灰度化,二值化,边缘检测后闭操作,查找最大轮廓,获取最小外接矩形及旋转角度,旋转摆正图片,水平投影,提取字符区域,用pytesseract识别字符

    项目代码存放在三个文件中

    1.工具类ocr_tool.py

    import re
    from difflib import SequenceMatcher
    
    
    # 提取字符串中的数字
    def obtain_digit(data):
        """Return every decimal digit found in ``data``, concatenated in order."""
        digit_runs = (found.group() for found in re.finditer(r"\d+", data))
        return ''.join(digit_runs)
    
    
    # 统计正确识别的数字个数
    def crct_digit_cnt(crct_isbn, recog_isbn):
        """
        Measure how much of the expected ISBN appears in the recognised text.

        :param crct_isbn: the expected (correct) digit string
        :param recog_isbn: the digit string produced by recognition
        :return: length of the longest contiguous block shared by both strings
        """
        matcher = SequenceMatcher(None, crct_isbn, recog_isbn)
        longest = matcher.find_longest_match(0, len(crct_isbn), 0, len(recog_isbn))
        return longest.size
    
    

    2.图片预处理相关函数img_process_tool.py

    from math import fabs, sin, radians, cos
    
    import cv2 as cv
    import numpy as np
    
    
    def img_show_wait(img, window_name, duration=100):
        """
        Show an image in a named window, then pause on ``cv.waitKey``.

        :param img: image to display
        :param window_name: title of the window the image is shown in
        :param duration: delay handed to ``cv.waitKey``; defaults to 100, so by
            default the call does not block until a key is pressed (per the
            OpenCV docs a delay <= 0 is what waits indefinitely for a key)
        """
        cv.imshow(window_name, img)
        cv.waitKey(duration)
    
    
    def get_projection_list(binary_img, direction='horizontal'):
        """
        Count the white (255) pixels of a binary image along one direction.

        :param binary_img: single-channel binary image (2-D array, or 3-D with a
            trailing channel axis of length 1)
        :param direction: 'horizontal' returns one count per row; any other value
            returns one count per column
        :return: list of ints holding the white-pixel count of each row or column
        :raises ValueError: if the image has more than one channel
        """
        h, w = binary_img.shape[:2]
        # Vectorised count instead of a per-pixel Python loop. The reshape raises
        # ValueError for multi-channel input, where "is this pixel white" would be
        # ambiguous.
        white = np.asarray(binary_img).reshape(h, w) == 255
        axis = 1 if direction == 'horizontal' else 0
        return white.sum(axis=axis).tolist()
    
    
    def draw_projection(data_list, rows, cols, direction='horizontal'):
        """
        Render a projection histogram as a black-on-white image, show it, return it.

        :param data_list: per-row (horizontal) or per-column (vertical) counts
        :param rows: number of rows of the output image
        :param cols: number of columns of the output image
        :param direction: 'horizontal' draws bars growing rightwards from the left
            edge; any other value draws bars growing upwards from the bottom edge
        :return: the projection image (uint8, white background, black bars)
        """
        img_proj = np.ones(shape=(rows, cols), dtype=np.uint8) * 255
        # An empty projection has no maximum; treat it like an all-zero one.
        peak = np.max(data_list) if len(data_list) > 0 else 0
        if direction == 'horizontal':
            # Scale so the largest count spans the full width. An all-zero
            # projection gets scale 0 instead of dividing by zero.
            weight = cols / peak if peak > 0 else 0
            for row, count in enumerate(data_list[:rows]):
                cv.line(img_proj, (0, row), (int(weight * count), row), (0,), 1)
            img_show_wait(img_proj, 'horizontal projection')
        else:
            # Scale so the largest count spans the full height.
            weight = rows / peak if peak > 0 else 0
            for col, count in enumerate(data_list[:cols]):
                cv.line(img_proj, (col, rows - 1), (col, rows - 1 - int(weight * count)), (0,), 1)
            img_show_wait(img_proj, 'vertical projection')
        return img_proj
    
    
    def split_projection_list(proj_list: list, min_val=0):
        """
        Split a projection into the runs whose values exceed a threshold.

        Each run is reported as ``(start, end)`` where ``end`` is the index of the
        last value above ``min_val`` and ``start`` is the index of the last value
        at or below ``min_val`` before the run (or 0 when the run begins at the
        first element).

        :param proj_list: projection counts
        :param min_val: values strictly greater than this belong to a run
        :return: list of ``(start, end)`` index pairs, in order of appearance
        """
        start = 0
        end = None
        split_list = []
        for idx, value in enumerate(proj_list):
            if value > min_val:
                end = idx
            else:
                if end is not None:
                    split_list.append((start, end))
                    end = None
                start = idx
        # A run that reaches the end of the list has no closing low value to
        # trigger the append above, so flush it here.
        if end is not None:
            split_list.append((start, end))
        return split_list
    
    
    def img_rotate(img, degree):
        """
        Rotate an image about its centre on a canvas enlarged to hold the result.

        :param img: input image
        :param degree: rotation angle in degrees, passed to cv.getRotationMatrix2D
        :return: tuple ``(rotated image, affine matrix used for the warp)``
        """
        src_h, src_w = img.shape[:2]
        theta = radians(degree)
        abs_sin, abs_cos = fabs(sin(theta)), fabs(cos(theta))
        # Bounding box of the rotated rectangle gives the enlarged canvas size.
        dst_h = int(src_w * abs_sin + src_h * abs_cos)
        dst_w = int(src_h * abs_sin + src_w * abs_cos)
        rotation = cv.getRotationMatrix2D((src_w // 2, src_h // 2), degree, 1)
        # Add a translation so the content is centred on the larger canvas.
        rotation[0, 2] += (dst_w - src_w) // 2
        rotation[1, 2] += (dst_h - src_h) // 2
        # Uncovered border pixels are filled with white.
        rotated = cv.warpAffine(img, rotation, (dst_w, dst_h), borderValue=(255, 255, 255))
        return rotated, rotation
    
    
    def draw_box(img, box):
        """
        Draw a green quadrilateral outline onto an image, in place.

        :param img: image to draw on
        :param box: flat sequence of 8 values ``(x0, y0, x1, y1, x2, y2, x3, y3)``
        :return: the same image object, with the four edges drawn
        """
        corners = [(box[i], box[i + 1]) for i in range(0, 8, 2)]
        green, thickness = (0, 255, 0), 3
        # Edges by corner index, in drawing order: 0-1, 1-2, 0-3, 2-3.
        for a, b in ((0, 1), (1, 2), (0, 3), (2, 3)):
            cv.line(img, corners[a], corners[b], green, thickness)
        return img
    
    
    def img_resize(img_original):
        """
        Shrink an image by a factor chosen from its height.

        Heights above 1300 are scaled by 0.25, heights in (750, 1300] by 0.5 and
        heights in (500, 750] by 0.75; shorter images are returned untouched.

        :param img_original: input image
        :return: the resized image, or the input itself when no scaling applies
        """
        height, _ = img_original.shape[:2]
        if height <= 500:
            return img_original
        if height > 1300:
            scale = 0.25
        elif height > 750:
            scale = 0.5
        else:
            scale = 0.75
        return cv.resize(img_original, None, fx=scale, fy=scale, interpolation=cv.INTER_CUBIC)
    
    
    def adaptive_threshold(gray, block_size=5, c=10, inv=False):
        """
        Binarise a grayscale image with a Gaussian-weighted adaptive threshold.

        The threshold is computed per neighbourhood, so it follows local changes
        in brightness across the image.

        :param gray: input grayscale image
        :param block_size: neighbourhood size passed to cv.adaptiveThreshold
        :param c: constant passed to cv.adaptiveThreshold as its ``C`` argument
        :param inv: when true use THRESH_BINARY_INV, otherwise THRESH_BINARY
        :return: the thresholded image (maximum value 255)
        """
        mode = cv.THRESH_BINARY_INV if inv else cv.THRESH_BINARY
        return cv.adaptiveThreshold(gray, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, mode, block_size, c)
    
    
    def img_preprocess(img, kernel=None):
        """
        Turn a colour image into a morphologically closed edge map and show it.

        Steps: 3x3 Gaussian blur, BGR-to-gray conversion, inverted Otsu threshold,
        Canny edge detection, then a closing operation with ``kernel``.

        :param img: input BGR image
        :param kernel: structuring element for the closing; a 5x5 rectangle is
            used when omitted
        :return: the closed edge image
        """
        if kernel is None:
            kernel = cv.getStructuringElement(cv.MORPH_RECT, (5, 5), (-1, -1))
        denoised = cv.GaussianBlur(img, (3, 3), 0)
        gray = cv.cvtColor(denoised, cv.COLOR_BGR2GRAY)
        # Otsu picks the global threshold itself, so the 0 passed here is ignored.
        _, binary = cv.threshold(gray, 0, 255, cv.THRESH_BINARY_INV + cv.THRESH_OTSU)
        edges = cv.Canny(binary, 50, 50)
        # Closing (dilate then erode) bridges gaps between neighbouring edges.
        closed = cv.morphologyEx(edges, cv.MORPH_CLOSE, kernel)
        img_show_wait(closed, 'original img close')
        return closed
    
    
    if __name__ == '__main__':
        # Smoke test: preprocess one image and display its horizontal projection.
        sample_path = r'D:\projects_python\workingon\isbnocr\pageocr\xxx.png'
        preprocessed = img_preprocess(cv.imread(sample_path))
        row_counts = get_projection_list(preprocessed, 'horizontal')
        height, width = preprocessed.shape[:2]
        draw_projection(row_counts, height, width, 'horizontal')
    
    

    3.核心代码isbnocr.py

    import os
    from os import listdir
    
    import pytesseract
    
    from img_process_tool import *
    from ocr_tool import *
    
    
    def img_isbn_area(img_original):
        """
        Deskew an image and crop out the horizontal band that holds the text.

        The largest contour of the preprocessed image supplies the skew angle.
        After rotating, a horizontal projection splits the image into bands; the
        last band that is no taller than half the image is returned.

        :param img_original: input BGR image
        :return: the selected band cropped from the rotated image, or the whole
            rotated image when no band qualifies
        """
        kernel = cv.getStructuringElement(cv.MORPH_RECT, (100, 5), (-1, -1))
        img_pre = img_preprocess(img_original, kernel)
        # findContours returns 2 values on OpenCV 4.x and 3 on 3.x; the contour
        # list is second from the end in both.
        contours = cv.findContours(img_pre, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)[-2]
        angle = 0.0  # no contours (blank image): leave the image unrotated
        if len(contours) > 0:
            largest = max(contours, key=cv.contourArea)
            angle = cv.minAreaRect(largest)[2]  # third element is the rotation angle
            if angle > 80:
                # NOTE(review): the angle convention of minAreaRect differs between
                # OpenCV versions - confirm this adjustment for the version in use.
                angle -= 90
        img_rotated, _ = img_rotate(img_original, angle)
        img_rotated_pre = img_preprocess(img_rotated, kernel)
        # Horizontal projection of the deskewed image, split into candidate bands.
        bands = split_projection_list(get_projection_list(img_rotated_pre), 0)
        img_h, img_w = img_rotated_pre.shape[:2]
        top, band_h = 0, img_h  # fallback: the whole rotated image
        for start, end in bands:
            height = end - start
            # Skip empty bands (cannot be displayed or recognised) and bands
            # taller than half the image.
            if height <= 0 or height > img_h * 0.5:
                continue
            top, band_h = start, height
            img_show_wait(img_rotated[top:top + band_h, 0:img_w], 'roi')
            # NOTE(review): no check on the number of characters inside a band is
            # applied here, so the last qualifying band wins.
        return img_rotated[top:top + band_h, 0:img_w]
    
    
    def split_digits(img_text):
        """
        Cut a text-line image into per-character slices via vertical projection.

        :param img_text: BGR image of the extracted text area
        :return: list of column slices of ``img_text``, one per projection segment
        """
        gray = cv.cvtColor(img_text, cv.COLOR_BGR2GRAY)
        _, binary = cv.threshold(gray, 0, 255, cv.THRESH_OTSU + cv.THRESH_BINARY_INV)
        img_show_wait(binary, 'isbn area bin')
        col_counts = get_projection_list(binary, 'vertical')
        height, width = img_text.shape[:2]
        draw_projection(col_counts, height, width, 'vertical')
        # Every (left, right) pair from the projection becomes one column slice.
        return [img_text[:, left:right] for left, right in split_projection_list(col_counts)]
    
    
    def digit_recog(path_to_image):
        """
        Recognise the ISBN digits in a single image and print the result.

        After printing, each character slice of the text area is displayed.

        :param path_to_image: absolute path of the image
        :return: None
        :raises ValueError: if the image cannot be read
        """
        img_original = cv.imread(path_to_image)
        if img_original is None:
            # cv.imread signals failure by returning None instead of raising.
            raise ValueError(f'cannot read image: {path_to_image}')
        isbn_area = img_isbn_area(img_resize(img_original))
        # Check for a missing text area before it is handed to the OCR engine.
        if isbn_area is None:
            return
        recog_isbn = obtain_digit(pytesseract.image_to_string(isbn_area))
        print(f'【图片{os.path.basename(path_to_image)}】的识别结果为:{recog_isbn}')
        digits = split_digits(isbn_area)
        if digits is not None:
            for digit_img in digits:
                img_show_wait(digit_img, 'digit')
    
    
    def digit_recog_batch(path_to_images):
        """
        Recognise every image in a folder and print accuracy statistics.

        The expected ISBN of each image is taken from the digits in its file name.
        Only images for which at least one digit was recognised are counted.

        :param path_to_images: absolute path of the folder holding the images
        :return: None
        """
        img_cnt = 0  # images that produced a non-empty recognition result
        digit_cnt = 0  # expected digits over all counted images
        digit_recognized = 0  # digits counted as correctly recognised
        isbn_recognized = 0  # images whose full ISBN was recognised
        for file in listdir(path_to_images):
            img_abs_path = os.path.join(path_to_images, file)
            img_basename = os.path.basename(img_abs_path)
            original_image = cv.imread(img_abs_path, cv.IMREAD_COLOR)
            if original_image is None:
                # cv.imread returns None for anything it cannot decode
                # (sub-directories, non-image files); skip instead of crashing.
                print(f'【图片{img_basename}】无法读取,已跳过')
                continue
            isbn_area = img_isbn_area(img_resize(original_image))
            if isbn_area is None:
                continue
            recog_isbn = obtain_digit(pytesseract.image_to_string(isbn_area))
            print(f'【图片{img_basename}】的识别结果为:{recog_isbn}')
            if not recog_isbn:
                # NOTE(review): images with no recognised digits are left out of
                # both denominators - confirm this is the intended metric.
                continue
            expected_isbn = obtain_digit(img_basename)
            img_cnt += 1
            digit_cnt += len(expected_isbn)
            digit_recognized += crct_digit_cnt(expected_isbn, recog_isbn)
            # An empty expectation is a substring of anything, so require digits
            # in the file name before counting a full match.
            if expected_isbn and expected_isbn in recog_isbn:
                isbn_recognized += 1

        # Guard the ratios: nothing recognised must not end in ZeroDivisionError.
        isbn_rate = isbn_recognized / img_cnt if img_cnt else 0.0
        digit_rate = digit_recognized / digit_cnt if digit_cnt else 0.0
        print("正确识别的ISBN个数:" + str(isbn_recognized) + "/" + str(img_cnt))
        print("正确识别的数字个数:" + str(digit_recognized) + "/" + str(digit_cnt))
        print("识别正确率:" + str(isbn_rate))
        print("识别准确率:" + str(digit_rate))
    
    
    # todo:字符识别可以用神经网络或者模板匹配
    if __name__ == "__main__":
        # Point pytesseract at the local Tesseract executable before any OCR call.
        pytesseract.pytesseract.tesseract_cmd = r"D:\software\Tesseract-OCR\tesseract.exe"
        single_image = r'D:\projects_python\workingon\isbnocr\isbn_recognition\ISBN 978-7-5099-1125-9.png'
        image_dir = r'D:\projects_python\workingon\isbnocr\isbn_recognition\images'
        # digit_recog(single_image)  # recognise a single image
        digit_recog_batch(image_dir)  # recognise the ISBN of every image in the folder
    
    

    这个项目,做下来感觉比较有参考价值的还是图片预处理的思路和相关实现;至于字符识别,由于时间关系没来得及写,就用了比较粗陋的方法实现,后面有时间再改成用模板匹配或者神经网络来识别字符。

    项目的源代码和数据集都同步到gitee上了,地址在这里:isbn-ocr: 计算机视觉课程设计 识别ISBN中的数字 (gitee.com)

  • 相关阅读:
    MongoDB学习:(一)MongoDB安装
    事件轮询 Event Loop
    常见的HTML5语义化标签
    前端动画性能优化方案
    前端动画的实现
    《SVN的操作流程及规范》
    css、js文件后的后缀作用是什么?
    实现单行文字溢出显示...,以及多行文字溢出显示...
    从输入URL到页面返回的过程详解
    jQuery实现点击复制效果
  • 原文地址:https://www.cnblogs.com/ericling/p/15589649.html
Copyright © 2020-2023  润新知