emperorking

java文字识别技术

Java文字识别程序的关键是寻找一个可以调用的OCR引擎。tesseract-ocr就是这样一个OCR引擎,它在1985年到1995年间由HP实验室开发,现在由Google维护。tesseract-ocr 3.0发布后开始支持中文。不过tesseract-ocr 3.0本身是命令行程序,没有图形化界面,而别人写的FreeOCR图形化客户端还不支持导入新的 3.0 traineddata。但这标志着,现在有自由的中文OCR软件了。

    java中使用tesseract-ocr3.01的步骤如下:

1.下载安装tesseract-ocr-setup-3.01-1.exe(3.0以上版本才增加了中文识别)

2.在安装向导中可以选择需要下载的语言包。

3.到网上搜索下载java图形处理所需的2个包:jai_imageio-1.1-alpha.jar,swingx-1.6.1.jar

4.java程序清单:

ImageIOHelper 类(需与下面的 OCR 类放在同一个包 com.hhp.util 中,即在源文件开头加上 package com.hhp.util; ,否则 OCR 类无法引用它):

 1     import java.awt.image.BufferedImage;  
 2     import java.io.File;  
 3     import java.io.IOException;  
 4     import java.util.Iterator;  
 5     import java.util.Locale;  
 6       
 7     import javax.imageio.IIOImage;  
 8     import javax.imageio.ImageIO;  
 9     import javax.imageio.ImageReader;  
10     import javax.imageio.ImageWriteParam;  
11     import javax.imageio.ImageWriter;  
12     import javax.imageio.metadata.IIOMetadata;  
13     import javax.imageio.stream.ImageInputStream;  
14     import javax.imageio.stream.ImageOutputStream;  
15       
16     import com.sun.media.imageio.plugins.tiff.TIFFImageWriteParam;  
17       
18     public class ImageIOHelper {    
19             
20         public static File createImage(File imageFile, String imageFormat) {    
21             File tempFile = null;    
22             try {    
23                 Iterator readers = ImageIO.getImageReadersByFormatName(imageFormat);    
24                 ImageReader reader = (ImageReader) readers.next();25                 
26                 ImageInputStream iis = ImageIO.createImageInputStream(imageFile);    
27                 reader.setInput(iis);    
28                 //Read the stream metadata    
29                 IIOMetadata streamMetadata = reader.getStreamMetadata();    
30                     
31                 //Set up the writeParam    
32                 TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.CHINESE);    
33                 tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);    
34                     
35                 //Get tif writer and set output to file    
36                 Iterator writers = ImageIO.getImageWritersByFormatName("tiff");    
37                 ImageWriter writer = writers.next();    
38                     
39                 BufferedImage bi = reader.read(0);    
40                 IIOImage image = new IIOImage(bi,null,reader.getImageMetadata(0));    
41                 tempFile = tempImageFile(imageFile);    
42                 ImageOutputStream ios = ImageIO.createImageOutputStream(tempFile);    
43                 writer.setOutput(ios);    
44                 writer.write(streamMetadata, image, tiffWriteParam);    
45                 ios.close();    
46                     
47                 writer.dispose();    
48                 reader.dispose();    
49                     
50             } catch (IOException e) {    
51                 e.printStackTrace();    
52             }    
53             return tempFile;    
54         }    
55         
56         private static File tempImageFile(File imageFile) {    
57             String path = imageFile.getPath();    
58             StringBuffer strB = new StringBuffer(path);    
59             strB.insert(path.lastIndexOf(\'.\'),0);    
60             return new File(strB.toString().replaceFirst("(?<=//.)(//w+)$", "tif"));    
61         }    
62         
63     }  

OCR 类:

 1 package com.hhp.util;  
 2   
 3 import java.io.BufferedReader;    
 4 import java.io.File;    
 5 import java.io.FileInputStream;    
 6 import java.io.InputStreamReader;    
 7 import java.util.ArrayList;    
 8 import java.util.List;    
 9 import org.jdesktop.swingx.util.OS;    
10     
11 public class OCR {    
12     private final String LANG_OPTION = "-l";  //英文字母小写l,并非数字1    
13     private final String EOL = System.getProperty("line.separator");    
14     private String tessPath = "C://Program Files (x86)//Tesseract-OCR";    
15     //private String tessPath = new File("tesseract").getAbsolutePath();    
16         
17     public String recognizeText(File imageFile,String imageFormat)throws Exception{    
18         File tempImage = ImageIOHelper.createImage(imageFile,imageFormat);    
19         File outputFile = new File(imageFile.getParentFile(),"output");    
20         StringBuffer strB = new StringBuffer();    
21         List cmd = new ArrayList();    
22         if(OS.isWindowsXP()){    
23             cmd.add(tessPath+"//tesseract");    
24         }else if(OS.isLinux()){    
25             cmd.add("tesseract");    
26         }else{    
27             cmd.add(tessPath+"//tesseract");    
28         }    
29         cmd.add("");    
30         cmd.add(outputFile.getName());    
31         cmd.add(LANG_OPTION);    
32         cmd.add("chi_sim");    
33         //cmd.add("eng");    
34             
35         ProcessBuilder pb = new ProcessBuilder();    
36         pb.directory(imageFile.getParentFile());    
37             
38         cmd.set(1, tempImage.getName());    
39         pb.command(cmd);    
40         pb.redirectErrorStream(true);    
41             
42         Process process = pb.start();    
43         //tesseract.exe 1.jpg 1 -l chi_sim    
44         int w = process.waitFor();    
45             
46         //删除临时正在工作文件    
47         tempImage.delete();    
48             
49         if(w==0){    
50             BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(outputFile.getAbsolutePath()+".txt"),"UTF-8"));    
51                 
52             String str;    
53             while((str = in.readLine())!=null){    
54                 strB.append(str).append(EOL);    
55             }    
56             in.close();    
57         }else{    
58             String msg;    
59             switch(w){    
60                 case 1:    
61                     msg = "Errors accessing files.There may be spaces in your image\'s filename.";    
62                     break;    
63                 case 29:    
64                     msg = "Cannot recongnize the image or its selected region.";    
65                     break;    
66                 case 31:    
67                     msg = "Unsupported image format.";    
68                     break;    
69                 default:    
70                     msg = "Errors occurred.";    
71             }    
72             tempImage.delete();    
73             throw new RuntimeException(msg);    
74         }    
75         new File(outputFile.getAbsolutePath()+".txt").delete();    
76         return strB.toString();    
77     }    
78 } 

测试类 OcrTest:

 1 import java.io.File;  
 2 import java.io.IOException;  
 3   
 4 import com.hhp.util.OCR;  
 5   
 6 public class OcrTest {  
 7   
 8  public static void main(String[] args) {  
 9         String path = "C://temp//OCRcode//4.png";       
10         System.out.println("ORC Test Begin......");  
11         try {       
12             String valCode = new OCR().recognizeText(new File(path), "png");       
13             System.out.println(valCode);       
14         } catch (IOException e) {       
15             e.printStackTrace();       
16         } catch (Exception e) {    
17             e.printStackTrace();    
18         }         
19         System.out.println("ORC Test End......");  
20     }    
21   
22 }

 

分类:

技术点:

相关文章: