python wand image純圖片PDF轉圖片後呼叫百度OCR
阿新 • • 發佈:2018-12-12
一、安裝庫
- pip install wand
- yum update
- yum -y install ImageMagick-devel 我自己是在docker裡面安裝測試的,沒問題
二、開整 大體思路:
- 使用wand處理純圖片的PDF檔案(轉成圖片)
- 使用Image庫處理PDF生成的圖片
- 圖片傳給百度OCR識別
貼程式碼吧
# coding=utf-8
from __future__ import division
import StringIO
import math
from wand.image import Image
# 這裡我起了個別名
from PIL import Image as PImage
# 百度OCR最大長度
bai_du_ocr_max = 4096
#主要方法
def convert(file_name, target_width=1500):
try:
with Image(filename=file_name) as img:
image_page_num = len(img.sequence)
# PDF裡面只有一張圖片
if image_page_num == 1:
# 獲取最終圖片寬高
target_width, target_height = _get_one_info(target_width, img.width, img.height)
# 縮放,文件上說比resize速度快
img.sample(target_width, target_height)
# 如果最終高度大於百度最大高度,則crop
if target_height > bai_du_ocr_max:
img.crop(0, 0, target_width, bai_du_ocr_max)
# img.save(filename='%s.jpg' % (str(int(time.time())) + '_' + str(img.width)))
result = img.make_blob('jpg')
# 下面是準備二值化,發現總體速度還不如直接傳給百度
# paste_image = PImage.open(StringIO.StringIO(img.make_blob('jpg')))
# paste_image = paste_image.convert("L")
# paste_image.show()
# d = StringIO.StringIO()
# paste_image.save(d, 'JPEG')
# result = d.getvalue()
# PDF裡面有一張以上圖片
else:
# 多張時,獲取最終寬高、拼接頁數
target_width, target_height, page_num = _get_more_info(
target_width, img.width, img.height, image_page_num
)
# 生成貼上的背景圖 (測試多次,發現L比RGB快)
paste_image = PImage.new('L', (target_width, target_height))
# 拼接圖片
for i in range(0, page_num):
image = Image(image=img.sequence[i])
# 計算一張圖的高度
one_img_height = int(target_height / page_num)
# 縮放
image.sample(target_width, one_img_height)
# 將wand庫檔案轉成PIL庫檔案
pasted_image = PImage.open(StringIO.StringIO(image.make_blob('jpg')))
# 將圖片貼上到背景圖
paste_image.paste(pasted_image, (0, one_img_height * i))
# 如果最終高度大於百度最大高度,則crop
if target_height > bai_du_ocr_max:
paste_image = paste_image.crop((0, 0, target_width, bai_du_ocr_max))
# 從記憶體中讀取檔案
d = StringIO.StringIO()
# 這裡是JPEG不是JPG
paste_image.save(d, 'JPEG')
result = d.getvalue()
# paste_image.save('%s.jpg' % (str(int(time.time())) + '_' + str(img.width)))
# 測試的時候可以開啟
# paste_image.show()
except Exception as e:
result = False
return result
# 一張時獲取寬高,如果圖片寬頻大於我們想要的寬度,則等比縮放圖片高度
def _get_one_info(target_width, img_width, img_height):
if img_width > target_width:
ratio = target_width / img_width
target_height = int(ratio * img_height)
else:
target_width = img_width
target_height = img_height
return target_width, target_height
# 多張時獲取寬高和拼接頁數
def _get_more_info(target_width, img_width, img_height, image_page_num):
one_width, one_height = _get_one_info(target_width, img_width, img_height)
if one_height < bai_du_ocr_max:
# 百度最大高度除以每張圖高度,向上取整,即拼接圖片的數量
num = int(math.ceil(bai_du_ocr_max / one_height))
# 取拼接數和總頁數的最小值
page_num = min(num, image_page_num)
return one_width, one_height * page_num, page_num
else:
return one_width, one_height, 1 # 1頁
# 除錯時候用
def _ocr(content):
url = '百度OCR連結(自己去百度OCR官網申請就行)'
img = base64.b64encode(content)
params = {"image": img}
params = urllib.urlencode(params)
request = urllib2.Request(url, params)
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
response = urllib2.urlopen(request)
content = response.read()
# print content
dict_content = json.loads(content)
text = "\n".join(map(lambda x: x["words"], dict_content["words_result"]))
return text
# 除錯時候用
def _write_file(path, data, type="w"):
try:
f = open(path, '%sb' % type)
except:
f = open(path.encode("utf-8"), '%sb' % type)
f.write(data)
f.close()
# 除錯時候用
if __name__ == '__main__':
import sys
import base64
import json
import urllib
import urllib2
import time
start = time.time()
source_file = sys.argv[1]
ret = convert(source_file, 1500)
end = time.time()
# 這裡我統一儲存下檔案,方便開啟觀察
_write_file(str(end) + '.jpg', ret)
if ret:
text = _ocr(ret)
end_parse = time.time()
print '____________________________________________'
print end - start
print end_parse - end
print '+++++++++++++++++++++++++++++++++++++++++++++'
print text
呼叫方法(python檔名字和檔案路徑自己替換)
python pdf2img.py file_path
PS: 我這裡一張和多張時分開的,我發現我的PDF檔案大多數都是一頁的(需求方),極少數是多頁。另外,1頁的時候只用wand一個庫就行,不需要貼上到大圖上,會省去一些時間。有些問題沒做就是,併發高的時候,伺服器記憶體可能扛不住,畢竟圖片都是在記憶體裡面。