ocr 文字区域检测及识别

# coding=utf-8

from PIL import Image, ImageFilter, ImageEnhance
from skimage.filters import threshold_otsu
import skimage.morphology as sm
from skimage.measure import regionprops
import matplotlib.pyplot as plt
import numpy as np
import pytesseract
import re
import os
import time
import logging
logging.basicConfig(level=logging.INFO, format="%(message)s", filename=\'train_output.log\')

class ShopCert(object):
    def cut_region(self, img):
        """
        先按规则缩小搜索范围
        """
        w, h = img.size
        if h<1200:
            factor = max(1, 1600.0/h)
            newsize = int(w*factor), int(h*factor)
            img = img.resize(newsize, Image.ANTIALIAS)
        if w<h:
            box = (w*0.4, h*0.18, w*0.96, h*0.6)
        else:
            box = (w*0.1, h*0.18, w*0.96, h*0.9)
        return img.crop(box)

    
    def detect_text(self, img):
        """
        检测字符区域
        """
        imgM = np.array(img.convert(\'L\'))
        imgM = 1 * (imgM < threshold_otsu(imgM)) 
        imgM = sm.binary_closing(imgM, np.ones((5, 10)))
        imgM = sm.remove_small_objects(imgM, 600)
        label_img = sm.label(imgM)
        imgList = []
        for region in regionprops(label_img):
            minr, minc, maxr, maxc = region.bbox
            w, h = (maxc-minc), (maxr-minr)
            if h > w * 0.2:
                continue
            box = minc-5, minr-3, maxc+5, maxr+3
            imgList.append(img.crop(box))
        return imgList

    
    
    def clear_noise(self, box):
        """
        降噪处理
        """
        box = box.convert(\'L\')
 #       box = box.point(lambda x: 0 if x<50 else x)
        box = box.point(lambda x: 200 if x>200 else x)
        box = ImageEnhance.Contrast(box).enhance(2.5)
        return box
    
    def predict(self, fname, lang=\'eng\'):
        """
        ocr 识别
        """
        img = Image.open(fname)
        # 先大致缩小范围
        region = self.cut_region(img)
        # 候选字符区域
#        region = self.clear_noise(region)
        boxList = self.detect_text(region)
        # 遍历识别
        for box in boxList:
            box = self.clear_noise(box)
            w, h = box.size
            if float(w)/h > 12.5:
                res = pytesseract.image_to_string(box, lang=\'chi_sim\', config=\'-psm 7\')
            else:
                res = pytesseract.image_to_string(box, lang=\'eng\', config=\'-psm 7\')
            res = re.sub(\'\s\', \'\', res)  # 去除中间空白
            res = re.findall(r\'[0-9][A-Z0-9]{13,20}\', res)  # 13-20位
            for line in res:
                line = line.strip()
                if line.find(u\'年\')>1:
                    continue
                print \'line\', line
                if len(line)> 14:
                    box.save(\'img/clearNoise/%s_%s.jpg\' % (fname.split(\'/\')[-1].split(\'.\')[0], line))
                    return line
                else:
                    print \'error line\', line
        return \'error\'


def show_pic(path=\'img/origin2/\'):
    fnames = [os.path.join(path, fname) for fname in os.listdir(path)]
    for i, fname in enumerate(fnames, 0):
        print fname
        img = Image.open(fname)
      #  img.save(\'./tesseract-train/cert.normal.exp%d.ttf\' % i)
        img = ImageEnhance.Contrast(img).enhance(2.0)
        img = img.filter(ImageFilter.MedianFilter).convert(\'L\')
        plt.figure(figsize=(10, 12), dpi=300)
        plt.imshow(img, plt.cm.gray)
        plt.title(fname.split(\'/\')[-1]+\'_%d\' % i)
        plt.show()if __name__ == \'__main__\':
    test = ShopCert()
    path = \'img/origin2/\'
    fnames = [os.path.join(path, fname) for fname in os.listdir(path) if fname.endswith(\'jpg\')]
    fnames.sort()
    
    arguments = \'mode: L; enhance:2.0; h:0.5; dh:0.15\'
    logging.info(\'%s\' % arguments)
    logging.info("%s: %s" % (\'imgname\', \'result\'))
    start_time = time.time()
    cnt = 0
    for idx, fname in enumerate(fnames, 1):
        print idx, fname
        y_true = fname.split(\'/\')[-1].split(\'_\')[0]
        y_pred = test.predict(fname)
        if y_true == y_pred:
            cnt +=1
            print fname
        else: 
            print \'***\'*20
            print \'error\'
        logging.info("%s: %s" % (fname, y_pred))
        print \'y_true\', y_true
        print \'y_pred\', y_pred
        acc =  float(cnt)/idx
        print acc, cnt
        print \'==\'*20, idx
        logging.info(\'%.3f %d/%d\' % (acc, cnt, idx))
    print \'cost time: \', time.time()-start_time
    logging.info(\'accuracy: %.2f\' % acc)