Tesseract 工具需要设置两个环境变量:Path 和 TESSDATA_PREFIX,变量值都是 Tesseract 的安装目录。
工具的安装和训练字库:https://www.cnblogs.com/zhongtang/p/5555950.html
将训练的字库合并:https://blog.csdn.net/woaipangruimao/article/details/78740270
设置好字符库后需要跟java程序结合:
1. String temp = "";
// Usage excerpt: download the captcha image, strip its colour noise with
// cleanImage(...), then run Tesseract OCR with the custom-trained font data.
// NOTE(review): the closing "} while (...)" of the loop below and any use of
// `count` are not part of this excerpt.
int count = 0;
do {
    Request request = new Request();
    request.Cookie = "ASP.NET_SessionId=1qestr2g3jvzcvq0yg22ymj1";
    Status status = new Status();
    Response rs = new Response();
    String picUrl = "http://www.afgl.gov.cn/user/login.aspx?AuthCode1$codeText=c705888&___clientRandom=0.8445339654723959";
    URL url = new URL(null);
    url.url = picUrl;
    // Save the captcha locally; the returned value is printed as the path and
    // appended to the base directory (plus Run.date_dir, when set) below.
    temp = Download.downloadFile_local(url, request, rs, status,
            "/www/spider.soufun.com/templet/anshan/");
    System.out.println("Path:"+temp);
    File file = new File("/www/spider.soufun.com/templet/anshan/"+(Run.date_dir == null ? "" : Run.date_dir)+temp);
    try {
        // Remove colour noise from the image.
        cleanImage(file, "/www/spider.soufun.com/templet/anshan/");
    } catch (IOException e1) {
        e1.printStackTrace();
    }
    // NOTE(review): this is a fixed sample file name, not derived from `file`
    // above - confirm that is intended.
    File imageFile = new File("/www/spider.soufun.com/templet/anshan/35765be6cd12a1ffbf782012f13db3ac.jpg");
    Tesseract test = new Tesseract();
    // Directory holding the trained data files. Backslashes must be doubled:
    // "\P" and "\T" are not valid Java escape sequences and fail to compile.
    test.setDatapath("C:\\Program Files (x86)\\Tesseract-OCR\\tessdata");
    // Name of the trained language data.
    test.setLanguage("font");
    String result = "";
    try {
        result = test.doOCR(imageFile);
        // Drop the spaces contained in the OCR output.
        result = result.replace(" ", "");
        System.out.println(result);
    } catch (TesseractException e) {
        e.printStackTrace();
    }
2.
/**
 * Brightens, gray-scales and binarizes an image, prints it as a character
 * matrix on standard output, and writes the result into {@code destDir}.
 *
 * <p>The output file is written in "jpg" format under the source file's
 * name, with any occurrence of "Gif" in that name replaced by "jpg".
 *
 * @param sfile   image file to process
 * @param destDir directory that receives the processed image; created if it
 *                does not exist
 * @return the binarized image
 * @throws IOException if the file cannot be read, cannot be decoded as an
 *                     image, or the result cannot be written
 */
public static BufferedImage cleanImage(File sfile, String destDir)
        throws IOException {
    File destF = new File(destDir);
    if (!destF.exists()) {
        destF.mkdirs();
    }

    BufferedImage bufferedImage = ImageIO.read(sfile);
    if (bufferedImage == null) {
        // ImageIO.read returns null when no registered reader can decode the
        // file; report it as an IOException instead of a NullPointerException.
        throw new IOException("Unsupported or unreadable image file: " + sfile);
    }
    int h = bufferedImage.getHeight();
    int w = bufferedImage.getWidth();

    // Gray-scale conversion.
    int[][] gray = new int[w][h];
    for (int x = 0; x < w; x++) {
        for (int y = 0; y < h; y++) {
            int argb = bufferedImage.getRGB(x, y);
            // Brighten each channel (the original author notes this improves
            // the recognition rate considerably) and clamp to 255.
            int r = Math.min(255, (int) (((argb >> 16) & 0xFF) * 1.1 + 30));
            int g = Math.min(255, (int) (((argb >> 8) & 0xFF) * 1.1 + 30));
            int b = Math.min(255, (int) (((argb >> 0) & 0xFF) * 1.1 + 30));
            // Weighted sum of the channels raised to the power 2.2, mapped
            // back with the inverse exponent.
            gray[x][y] = (int) Math
                    .pow((Math.pow(r, 2.2) * 0.2973 + Math.pow(g, 2.2)
                            * 0.6274 + Math.pow(b, 2.2) * 0.0753), 1 / 2.2);
        }
    }

    // Binarization against the threshold computed by ostu(...).
    int threshold = ostu(gray, w, h);
    BufferedImage binaryBufferedImage = new BufferedImage(w, h,
            BufferedImage.TYPE_BYTE_BINARY);
    for (int x = 0; x < w; x++) {
        for (int y = 0; y < h; y++) {
            if (gray[x][y] > threshold) {
                gray[x][y] |= 0x00FFFF;
            } else {
                gray[x][y] &= 0xFF0000;
            }
            binaryBufferedImage.setRGB(x, y, gray[x][y]);
        }
    }

    // Print the image as a matrix: '*' where isBlack(...) is true, ' ' otherwise.
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            if (isBlack(binaryBufferedImage.getRGB(x, y))) {
                System.out.print("*");
            } else {
                System.out.print(" ");
            }
        }
        System.out.println();
    }

    ImageIO.write(binaryBufferedImage, "jpg", new File(destDir, sfile.getName().replace("Gif", "jpg")));
    return binaryBufferedImage;
}