Tesseract
1 ============================================================ 2 Tesseract 图片识别 3 ============================================================ 4 5 6 开门见山1:Tesseract的安装 7 --------------------------------------------------------------------------------------------------------------------------------- 8 | shell中安装Tesseract 9 | ------------------------------------------------------------------------------------ 10 | centos : yum -y install tesseract 11 | ubuntu : apt-get install tesseract-ocr 命令行中运行tesseract指令即可 12 | 13 | python中安装 14 | --------------------------------------------------------- 15 | pip install pytesseract 16 | 17 --------------------------------------------------------------------------------------------------------------------------------- 18 19 20 开门见山2:Tesseract的使用 21 --------------------------------------------------------------------------------------------------------------------------------- 22 | 23 | shell中使用tesseract 24 | --------------------------------------------------------------------------------------- 25 | tesseract test.png text 第一步:使用tesseract命令将图片的内容经过光学文字识别(OCR)为文本并保存到text.txt文件中 26 | cat text.txt 第二步:打开文件查看识别结果 27 | 28 | 29 | python中使用tesseract 30 | ------------------------------------------------------------------------------------------ 31 | pip install pytesseract 第一步:安装pytesseract模块 32 | 33 | import pytesseract 第二步:导入pytesseract模块 34 | 35 | from PIL import Image 第三步:导入PIL库中的Image类(PIL即pillow) 36 | 37 | pic = Image.open("test.png") 第四步:调用Image类的open()方法,打开一张图片,创建一个图片对象操作实例 38 | 39 | text = pytesseract.image_to_string(pic) 第五步:调用pytesseract的image_to_string()方法,将上述图片实例识别为文本 40 | 41 | print text 42 | 43 --------------------------------------------------------------------------------------------------------------------------------- 44 45 46 附加:处理模糊图片识别问题 47 --------------------------------------------------------------------------------------------------------------------------------- 48 | 
需要利用pillow库来创建一个阈值过滤器去掉图片中渐变的背景色,只把图片中的文本部分留下来,从而让图片更加清晰 49 | 50 | from PIL import Image 51 | import subprocess 52 | 53 | def clean_img(in_image,out_image): # 定义一个清理图片的方法clean_img 54 | pic = Image.open(in_image) # 打开一张待清理的图片 55 | pic = pic.point(lambda x:0 if x<143 else 255) # 对图片进行阈值过滤 56 | pic.save(out_image) # 将过滤后的图片另外保存 57 | subprocess.call([\'tesseract\',out_image,\'text\']) # 调用系统的tesseract命令对out_image图片进行识别,将文本保存到text.txt文件 <--- 这个subprocess模块貌似很有用哦!!!! 58 | f = open(\'text.txt\',\'r\') 59 | print f.read() # 以只读模式打开查看文件内容(被识别的文字) 60 | f.close() 61 | 62 ---------------------------------------------------------------------------------------------------------------------------------- 63 64 附加:训练Tesseract更好识别图片中文字(详见资料) 65 66 67 Tesseract图片识别爬虫案例1: 爬取并识别<<战争与和平>>书籍的介绍图片 68 ---------------------------------------------------------------------------------------------------------------------------- 69 | 70 | import time 71 | import subprocess 72 | from urllib import urlretrieve 73 | from selenium import webdriver 74 | 75 | driver = webdriver.PhantomJS() 76 | driver.get(\'http://www.amazon.com/war-peace\') 77 | 78 | import time 79 | from urllib.request import urlretrieve 80 | import subprocess 81 | from selenium import webdriver 82 | 83 | driver = webdriver.PhantomJS() #创建新的Selenium driver 84 | driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200") 85 | driver.find_element_by_id("sitbLogoImg").click() # 查找并模拟单击图书预览按钮元素 86 | imageList = set() # 设置保存图书图片地址的空集合 87 | time.sleep(5) # 等待图书预览页面加载完成 88 | 89 | while "pointer" in driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"): # 循环等待"下一页"按钮元素出现,按钮元素出现就执行下面的模拟点击操作 90 | driver.find_element_by_id("sitbReaderRightPageTurner").click() # 找到"下一页"按钮并模拟点击翻页 91 | time.sleep(2) # 等待下一预览页的加载完成 92 | pages = driver.find_elements_by_xpath("//div[@class=\'pageImage\']/div/img") # Xpath匹配提取预览页中包含图片地址的标签,并返回标签列表 93 | for page in pages: 94 | image = page.get_attribute("src") # 
遍历标签列表,从标签中取出图片链接,并添加到imagelist集合中 95 | imageList.add(image) 96 | driver.quit() 97 | 98 | for image in sorted(imageList): # for循环遍历图片链接的集合,逐一取出图片地址并下载保存到本地,存放为page.jpg 99 | urlretrieve(image, "page.jpg") 100 | p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE,stderr=subprocess.PIPE) # 调用系统的tesseract识别这些图片的内容,并存入page.txt 101 | f = open("page.txt", "r") 102 | p.wait() 103 | print(f.read()) # 以只读模式打开查看page.txt文件内容 104 | 105 ---------------------------------------------------------------------------------------------------------------------------- 106 107 108 Tesseract图片识别爬虫案例2: 手动查看知乎验证码图片实现登陆知乎 109 ---------------------------------------------------------------------------------------------------------------------------- 110 | 111 | #!/usr/bin/env python 112 | # -*- coding:utf-8 -*- 113 | 114 | from bs4 import BeautifulSoup 115 | import requests 116 | import time 117 | 118 | 119 | def zhihuLogin(): 120 | sess = requests.Session() # 构建一个Session对象,可以保存页面Cookie 121 | headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"} 122 | html = sess.get("https://www.zhihu.com/#signin", headers = headers).text # 首先获取登录页面,找到需要POST的数据(_xsrf),同时会记录当前网页的Cookie值 123 | bs = BeautifulSoup(html, "lxml") # 调用lxml解析库 124 | _xsrf = bs.find("input", attrs={"name":"_xsrf"}).get("value") # 找到name属性值为 _xsrf 的input标签,再取出value 的值 125 | captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000) # 根据UNIX时间戳,匹配出验证码的URL地址 126 | captcha_data = sess.get(captcha_url, headers = headers).content # 发送图片的请求,获取图片数据流, 127 | with open("captcha.jpg", "wb") as f: # 将验证码图片保存到本地 128 | f.write(captcha_data) 129 | 130 | data = { 131 | "_xsrf" : _xsrf, 132 | "account" :raw_input("请输入知乎用户名:"), 133 | "password" :raw_input("请输入知乎密码:"), 134 | "captcha" :raw_input("请查看验证码图片并输入验证码:") # 需要自行查看存放到本地验证码图片里的文字,需要手动输入 135 | } 136 | 137 | response = 
sess.post("https://www.zhihu.com/login/email", data = data, headers = headers) # 发送验证用户登录需要的POST数据,获取成功登录后的Cookie(保存在sess里) 138 | response = sess.get("https://www.zhihu.com/people/xiao-jing-qian-60/activities", headers = headers) # 用已有成功登录状态的Cookie发送请求,获取好友页面源码 139 | print response.text 140 | 141 | if __name__ == "__main__": 142 | zhihuLogin() 143 | 144 ---------------------------------------------------------------------------------------------------------------------------- 145 146 147 Tesseract图片识别爬虫案例3: Tesseract自动识别验证码图片实现登陆知乎 148 ---------------------------------------------------------------------------------------------------------------------------- 149 | 150 | #!/usr/bin/env python 151 | # -*- coding:utf-8 -*- 152 | 153 | import sys 154 | reload(sys) 155 | sys.setdefaultencoding(\'utf-8\') 156 | 157 | from bs4 import BeautifulSoup 158 | import requests 159 | import time 160 | from pytesseract import * 161 | from PIL import Image 162 | 163 | 164 | def zhihuLogin(): 165 | sess = requests.Session() # 构建一个Session对象,可以保存页面Cookie 166 | headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"} 167 | html = sess.get("https://www.zhihu.com/#signin", headers = headers).text # 首先获取登录页面,找到需要POST的数据(_xsrf),同时会记录当前网页的Cookie值 168 | bs = BeautifulSoup(html, "lxml") # 调用lxml解析库 169 | _xsrf = bs.find("input", attrs={"name":"_xsrf"}).get("value") # 找到name属性值为 _xsrf 的input标签,再取出value 的值 170 | captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000) # 根据UNIX时间戳,匹配出验证码的URL地址 171 | captcha_data = sess.get(captcha_url, headers = headers).content # 发送图片的请求,获取图片数据流, 172 | with open("captcha.jpg", "wb") as f: # 将验证码图片保存到本地 173 | f.write(captcha_data) 174 | time.sleep(1) # 防止图片写在本地的时候还没有写完 175 | image = Image.open("captcha.jpg") # 打开验证码图片 176 | text = image_to_string(image) # 将验证码图片中识别的文字赋值给text变量中 177 | print "机器识别后的验证码为:" + text # 打印Tesseract识别后的验证码文字 178 | command 
= raw_input("输入y表示验证正确,同意使用(输入其他按键则自行输入验证码:)") 179 | if (command == "Y" or command == "y"): 180 | text = text 181 | else: 182 | text = raw_input("请手动输入验证码:") # 用户查看本地验证码图片中的文字是否与Tesseract识别的文字一样,从而输入y/n 183 | 184 | data = { 185 | "_xsrf" : _xsrf, 186 | "email" : raw_input("请输入知乎用户名:"), 187 | "password" : raw_input("请输入用户密码:"), 188 | "captcha" : text 189 | } 190 | 191 | response = sess.post("https://www.zhihu.com/login/email", data = data, headers = headers) # 发送验证用户登录需要的POST数据,获取成功登录后的Cookie(保存在sess里) 192 | response = sess.get("https://www.zhihu.com/people/xiao-jing-qian-60/activities", headers = headers) # 用已有成功登录状态的Cookie发送请求,获取好友页面源码 193 | print response.text 194 | 195 | if __name__ == "__main__": 196 | zhihuLogin() 197 | 198 ----------------------------------------------------------------------------------------------------------------------------