ISBN识别
学校三级项目需要批量识别ISBN中的数字
实现的大致思路如下:
对原始图片按尺寸自动调整大小,高斯滤波去噪,灰度化,二值化,边缘检测后做闭运算,查找最大轮廓,获取最小外接矩形及旋转角度,旋转摆正图片,水平投影,提取字符区域,最后用pytesseract识别字符。
项目代码存放在三个文件中
1.工具类ocr_tool.py
import re
from difflib import SequenceMatcher
def obtain_digit(data):
    """Return all digits found in ``data``, concatenated in their original order.

    :param data: text to scan
    :return: string made only of the digits of ``data``; '' when it has none
    """
    return ''.join(match.group() for match in re.finditer(r"\d+", data))
def crct_digit_cnt(crct_isbn, recog_isbn):
    """Count how many digits of the expected ISBN were recognised correctly.

    Both strings are aligned with ``difflib.SequenceMatcher`` and the sizes of
    all matching blocks are summed. Counting only the single longest match
    would discard every correct digit on the shorter side of one wrong digit.

    :param crct_isbn: expected digit string
    :param recog_isbn: digit string produced by the OCR step
    :return: number of digits that match, in order, between the two strings
    """
    # autojunk only affects long sequences; disabling it keeps the count exact.
    matcher = SequenceMatcher(None, crct_isbn, recog_isbn, autojunk=False)
    return sum(block.size for block in matcher.get_matching_blocks())
2.图片预处理相关函数img_process_tool.py
from math import fabs, sin, radians, cos
import cv2 as cv
import numpy as np
def img_show_wait(img, window_name, duration=100):
    """
    Display an image in a named OpenCV window and then call ``cv.waitKey``.

    :param img: image to display
    :param window_name: name of the window the image is shown in
    :param duration: value handed to ``cv.waitKey``; defaults to 100, so the
        call does not block until a key is pressed unless 0 is passed explicitly
    """
    cv.imshow(window_name, img)
    cv.waitKey(duration)
def get_projection_list(binary_img, direction='horizontal'):
    """
    Count the white (255) pixels of a binary image along one direction.

    :param binary_img: single-channel binary image as a 2-D array
    :param direction: 'horizontal' yields one count per row; any other value
        yields one count per column
    :return: list of ints, the number of pixels equal to 255 in each row
        ('horizontal') or in each column (otherwise)
    """
    # axis=1 collapses the columns (one total per row); axis=0 collapses the rows.
    axis = 1 if direction == 'horizontal' else 0
    # One vectorised count replaces a Python-level loop over every pixel.
    white_mask = np.asarray(binary_img) == 255
    return np.count_nonzero(white_mask, axis=axis).tolist()
def draw_projection(data_list, rows, cols, direction='horizontal'):
    """
    Render a projection as a bar chart, display it and return the chart.

    Bars are black on a white ``rows`` x ``cols`` canvas and are scaled so the
    largest value spans the whole canvas. For 'horizontal' one bar per row is
    drawn from the left edge; otherwise one bar per column is drawn from the
    bottom edge.

    :param data_list: projection counts, indexed by row ('horizontal') or by
        column (otherwise)
    :param rows: number of rows of the source image
    :param cols: number of columns of the source image
    :param direction: 'horizontal' or anything else for the vertical chart
    :return: the rendered chart as a uint8 image
    """
    img_proj = np.full((rows, cols), 255, dtype=np.uint8)
    peak = max(data_list, default=0)
    horizontal = direction == 'horizontal'
    full_length = cols if horizontal else rows
    # An empty or all-zero projection has no bar to scale; a zero factor
    # avoids dividing by zero.
    scale = full_length / peak if peak > 0 else 0
    if horizontal:
        for row in range(rows):
            bar_end = (int(scale * data_list[row]), row)
            cv.line(img_proj, (0, row), bar_end, (0,), 1)
        img_show_wait(img_proj, 'horizontal projection')
    else:
        for col in range(cols):
            bar_end = (col, rows - 1 - int(scale * data_list[col]))
            cv.line(img_proj, (col, rows - 1), bar_end, (0,), 1)
        img_show_wait(img_proj, 'vertical projection')
    return img_proj
def split_projection_list(proj_list: list, min_val=0):
    """
    Split projection counts into the runs whose values exceed a threshold.

    Each run is reported as a half-open interval ``(start, end)``: ``start`` is
    the index of the first value above ``min_val`` and ``end`` is one past the
    last, so ``proj_list[start:end]`` (or the matching image slice) is exactly
    the run. A run that reaches the end of the list is reported as well.

    :param proj_list: projection counts
    :param min_val: values must be strictly greater than this to belong to a run
    :return: list of ``(start, end)`` tuples in ascending order
    """
    segments = []
    run_start = None  # index where the current run began, None while in a gap
    for idx, value in enumerate(proj_list):
        if value > min_val:
            if run_start is None:
                run_start = idx
        elif run_start is not None:
            segments.append((run_start, idx))
            run_start = None
    if run_start is not None:
        # The last run touches the end of the list and was never closed by a gap.
        segments.append((run_start, len(proj_list)))
    return segments
def img_rotate(img, degree):
    """
    Rotate an image about its centre on a canvas enlarged to hold the result.

    :param img: input image
    :param degree: rotation angle in degrees, passed to ``cv.getRotationMatrix2D``
    :return: tuple ``(rotated image, affine matrix used for the warp)``
    """
    src_h, src_w = img.shape[:2]
    abs_sin = fabs(sin(radians(degree)))
    abs_cos = fabs(cos(radians(degree)))
    # Size of the axis-aligned box that encloses the rotated image.
    dst_h = int(src_w * abs_sin + src_h * abs_cos)
    dst_w = int(src_h * abs_sin + src_w * abs_cos)
    affine = cv.getRotationMatrix2D((src_w // 2, src_h // 2), degree, 1)
    # Shift the result by half the size difference to centre it on the new canvas.
    affine[0, 2] += (dst_w - src_w) // 2
    affine[1, 2] += (dst_h - src_h) // 2
    # Areas uncovered by the rotation are filled with white.
    rotated = cv.warpAffine(img, affine, (dst_w, dst_h), borderValue=(255, 255, 255))
    return rotated, affine
def draw_box(img, box):
    """
    Draw a four-sided green outline on an image, in place.

    :param img: image to draw on
    :param box: flat sequence of 8 coordinates ``x0, y0, x1, y1, x2, y2, x3, y3``
    :return: the same image object, with the outline drawn
    """
    # Each entry lists the positions in ``box`` of one edge: (x_a, y_a, x_b, y_b).
    edges = ((0, 1, 2, 3), (2, 3, 4, 5), (0, 1, 6, 7), (4, 5, 6, 7))
    for xa, ya, xb, yb in edges:
        cv.line(img, (box[xa], box[ya]), (box[xb], box[yb]), (0, 255, 0), 3)
    return img
def img_resize(img_original):
    """
    Shrink an image by a factor chosen from its height.

    More than 1300 rows: scaled to 25%; 751-1300 rows: 50%; 501-750 rows: 75%;
    500 rows or fewer: returned unchanged.

    :param img_original: input image
    :return: the resized image, or the input itself when no scaling applies
    """
    rows, _ = img_original.shape[:2]
    if rows > 1300:
        scale = 0.25
    elif rows > 750:
        scale = 0.5
    elif rows > 500:
        scale = 0.75
    else:
        return img_original
    return cv.resize(img_original, None, fx=scale, fy=scale, interpolation=cv.INTER_CUBIC)
def adaptive_threshold(gray, block_size=5, c=10, inv=False):
    """
    Binarise a grayscale image with OpenCV's Gaussian adaptive threshold.

    The threshold is computed per neighbourhood, so it follows brightness
    changes across the image.

    :param gray: input grayscale image
    :param block_size: forwarded to ``cv.adaptiveThreshold`` as the block size
    :param c: forwarded to ``cv.adaptiveThreshold`` as the constant C
    :param inv: use ``cv.THRESH_BINARY_INV`` when true, ``cv.THRESH_BINARY`` otherwise
    :return: the binarised image (maximum value 255)
    """
    mode = cv.THRESH_BINARY_INV if inv else cv.THRESH_BINARY
    return cv.adaptiveThreshold(
        gray, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, mode, block_size, c)
def img_preprocess(img, kernel=None):
    """
    Turn a colour image into a closed edge map and display it.

    Steps: 3x3 Gaussian blur, BGR-to-gray conversion, inverted Otsu threshold,
    Canny edge detection, morphological closing. The result is shown in the
    'original img close' window before it is returned.

    :param img: input image (converted with ``cv.COLOR_BGR2GRAY``)
    :param kernel: structuring element for the closing; a 5x5 rectangle when None
    :return: the closed edge image
    """
    if kernel is None:
        kernel = cv.getStructuringElement(cv.MORPH_RECT, (5, 5), (-1, -1))
    blurred = cv.GaussianBlur(img, (3, 3), 0)  # denoise
    gray = cv.cvtColor(blurred, cv.COLOR_BGR2GRAY)
    # Otsu picks the global threshold; the 0 passed as threshold is ignored.
    _, binary = cv.threshold(gray, 0, 255, cv.THRESH_BINARY_INV + cv.THRESH_OTSU)
    edges = cv.Canny(binary, 50, 50)
    # Closing (dilate, then erode) bridges gaps between neighbouring edges.
    closed = cv.morphologyEx(edges, cv.MORPH_CLOSE, kernel)
    img_show_wait(closed, 'original img close')
    return closed
if __name__ == '__main__':
    # Manual check: preprocess one image and display its horizontal projection.
    path_to_img = r'D:\projects_python\workingon\isbnocr\pageocr\xxx.png'
    img_original = cv.imread(path_to_img)
    if img_original is None:
        # cv.imread returns None instead of raising when the file cannot be read.
        raise FileNotFoundError(f'cannot read image: {path_to_img}')
    img_pre = img_preprocess(img_original)
    horizontal_projection_list = get_projection_list(img_pre, 'horizontal')
    rows, cols = img_pre.shape[:2]
    draw_projection(horizontal_projection_list, rows, cols, 'horizontal')
3.核心代码isbnocr.py
import os
from os import listdir
import pytesseract
from img_process_tool import *
from ocr_tool import *
def img_isbn_area(img_original):
    """
    Deskew the picture and crop the horizontal band taken to be the ISBN text.

    The largest contour of the preprocessed image gives the skew angle through
    its minimum-area rectangle. The picture is rotated by that angle and
    projected horizontally; the last band that is not taller than half of the
    rotated image is cropped at full width. Every candidate band is displayed
    in the 'roi' window.

    NOTE(review): the loop nesting was reconstructed from a source whose
    indentation had been flattened. A check that counted at least 12 vertical
    segments on the whole image had no effect on the result in that reading
    and is not reproduced here — confirm against the author's intent.

    :param img_original: input BGR image
    :return: the cropped band of the rotated image; the whole rotated image
        when no band qualifies; None when no contour is found
    """
    kernel = cv.getStructuringElement(cv.MORPH_RECT, (100, 5), (-1, -1))
    img_pre = img_preprocess(img_original, kernel)

    # findContours returns 3 values on OpenCV 3.x and 2 on 4.x; the contour
    # list is the second-to-last item in both.
    contours = cv.findContours(img_pre, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)[-2]
    if len(contours) == 0:
        return None
    largest = max(contours, key=cv.contourArea)

    # Third element of the min-area rectangle is its rotation angle.
    angle = cv.minAreaRect(largest)[2]
    # NOTE(review): the range of this angle differs between OpenCV releases —
    # confirm the adjustment below against the installed version.
    if angle > 80:
        angle -= 90
    img_rotated, _ = img_rotate(img_original, angle)
    img_rotated_pre = img_preprocess(img_rotated, kernel)

    row_bands = split_projection_list(get_projection_list(img_rotated_pre), 0)
    img_h, img_w = img_rotated_pre.shape[:2]
    top, height = 0, img_h  # fall back to the whole image when no band qualifies
    for start, end in row_bands:
        if end - start > img_h * 0.5:
            # Bands taller than half the image are skipped.
            continue
        top, height = start, end - start
        img_show_wait(img_rotated[top:top + height, 0:img_w], 'roi')
    return img_rotated[top:top + height, 0:img_w]
def split_digits(img_text):
    """
    Cut the text strip into one image per character column.

    The strip is converted to gray, binarised with an inverted Otsu threshold
    and projected vertically; every interval returned by
    ``split_projection_list`` is sliced out of the original strip. The binary
    image and the projection chart are displayed along the way.

    :param img_text: BGR image of the text strip
    :return: list of sub-images of ``img_text``, one per interval
    """
    gray = cv.cvtColor(img_text, cv.COLOR_BGR2GRAY)
    _, binary = cv.threshold(gray, 0, 255, cv.THRESH_OTSU + cv.THRESH_BINARY_INV)
    img_show_wait(binary, 'isbn area bin')
    column_counts = get_projection_list(binary, 'vertical')
    height, width = img_text.shape[:2]
    draw_projection(column_counts, height, width, 'vertical')
    return [img_text[:, left:right]
            for left, right in split_projection_list(column_counts)]
def digit_recog(path_to_image):
    """
    Recognise the ISBN digits in one image and display each segmented character.

    The recognised digits are printed. Nothing is recognised when the file
    cannot be read or when no text area is found; a message is printed instead.

    :param path_to_image: absolute path of the image
    :return: None
    """
    img_name = os.path.basename(path_to_image)
    img_original = cv.imread(path_to_image)
    if img_original is None:
        # cv.imread returns None instead of raising when the file cannot be read.
        print(f'【图片{img_name}】无法读取')
        return
    isbn_area = img_isbn_area(img_resize(img_original))
    if isbn_area is None:
        # Check before OCR: there is nothing to hand to pytesseract.
        print(f'【图片{img_name}】未找到字符区域')
        return
    recog_isbn = obtain_digit(pytesseract.image_to_string(isbn_area))
    print(f'【图片{img_name}】的识别结果为:{recog_isbn}')
    for digit_img in split_digits(isbn_area):
        img_show_wait(digit_img, 'digit')
def digit_recog_batch(path_to_images):
    """
    Recognise the ISBN in every image of a folder and print accuracy figures.

    The expected ISBN of an image is the digits contained in its file name.
    Files that cannot be read as images are reported and skipped.

    :param path_to_images: absolute path of the folder holding the images
    :return: None; per-image results and the totals are printed
    """
    img_cnt = 0  # images that produced at least one digit
    digit_cnt = 0  # expected digits over those images
    digit_recognized = 0  # digits recognised correctly
    isbn_recognized = 0  # images whose full ISBN was recognised
    for file in listdir(path_to_images):
        img_abs_path = os.path.join(path_to_images, file)
        original_image = cv.imread(img_abs_path, cv.IMREAD_COLOR)
        if original_image is None:
            # cv.imread returns None for unreadable or non-image files; one such
            # file must not abort the whole batch.
            print(f'【图片{file}】无法读取,已跳过')
            continue
        isbn_area = img_isbn_area(img_resize(original_image))
        if isbn_area is None:
            continue
        recog_isbn = obtain_digit(pytesseract.image_to_string(isbn_area))
        print(f'【图片{file}】的识别结果为:{recog_isbn}')
        if not recog_isbn:
            # Images whose OCR output has no digits are excluded from every total.
            # NOTE(review): confirm they should not count as failures instead.
            continue
        expected_isbn = obtain_digit(file)
        img_cnt += 1
        digit_cnt += len(expected_isbn)
        digit_recognized += crct_digit_cnt(expected_isbn, recog_isbn)
        # An empty expected value is contained in any string, so it must not
        # count as a correct ISBN.
        if expected_isbn and expected_isbn in recog_isbn:
            isbn_recognized += 1
    print("正确识别的ISBN个数:" + str(isbn_recognized) + "/" + str(img_cnt))
    print("正确识别的数字个数:" + str(digit_recognized) + "/" + str(digit_cnt))
    # Guard the ratios: an empty folder or a run with no digits leaves the totals at 0.
    isbn_ratio = str(isbn_recognized / img_cnt) if img_cnt else "N/A"
    digit_ratio = str(digit_recognized / digit_cnt) if digit_cnt else "N/A"
    print("识别正确率:" + isbn_ratio)
    print("识别准确率:" + digit_ratio)
# TODO: character recognition could use a neural network or template matching
if __name__ == "__main__":
    # Path of the Tesseract executable that pytesseract should invoke.
    pytesseract.pytesseract.tesseract_cmd = r"D:\software\Tesseract-OCR\tesseract.exe"
    path_to_image = r'D:\projects_python\workingon\isbnocr\isbn_recognition\ISBN 978-7-5099-1125-9.png'
    path_to_images = r'D:\projects_python\workingon\isbnocr\isbn_recognition\images'
    # digit_recog(path_to_image)  # recognise a single image
    digit_recog_batch(path_to_images)  # recognise the ISBN of every image in the folder
这个项目做下来,感觉比较有参考价值的还是图片预处理的思路和相关实现。至于字符识别,由于时间关系没来得及自己实现,就直接调用了pytesseract这种比较粗陋的方法,后面有时间再改成用模板匹配或者神经网络来识别字符。
项目的源代码和数据集都已同步到gitee上,仓库名为 isbn-ocr(计算机视觉课程设计:识别ISBN中的数字,gitee.com)。