java+OpenCV3 +百度OCR(或tesseract) 識別表格資料

阿新 • • 發佈：2019-02-13

原理：先用 OpenCV 識別出表格，再按交點把表格拆分成單元格圖片，最後把每張單元格圖片交給百度 OCR 或 tesseract 識別。

當然，有預算的話也可以直接購買百度的 OCR 表格識別服務。

package com.test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.json.JSONArray;
import org.json.JSONObject;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.MatOfPoint;
import org.opencv.core.MatOfPoint2f;
import org.opencv.core.Point;
import org.opencv.core.Rect;
import org.opencv.core.Scalar;
import org.opencv.core.Size;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;

import com.baidu.aip.ocr.AipOcr;

/**
* Servlet implementation class TutableRead
*/
@WebServlet("/TutableReadBaidu")
public class TutableReadBaidu extends HttpServlet {
// Serialization version identifier for this servlet class.
private static final long serialVersionUID = 1L;

/**
 * Handles GET requests exactly like POST requests by delegating to
 * {@link #doPost(HttpServletRequest, HttpServletResponse)}.
 *
 * @param request  the incoming request, passed through unchanged
 * @param response the response, passed through unchanged
 * @throws ServletException propagated from {@code doPost}
 * @throws IOException      propagated from {@code doPost}
 */
protected void doGet(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    this.doPost(request, response);
}
static{
System.load("F:/opencv/build/java/x86/opencv_java310.dll");
}

// Baidu OCR credentials (APPID / API key / secret key) obtained when registering with Baidu.
// They are empty strings in this listing.
public static final String APP_ID = "";
public static final String API_KEY = "";
public static final String SECRET_KEY = "";
/**
 * Detects tables in a PNG image, cuts every table into cell images and runs Baidu OCR on each cell.
 *
 * <p>Request parameters read here:
 * <ul>
 * <li>{@code name} - base name (without extension) of the PNG under
 *     {@code C:/Users/lilin/Desktop/}; it must be a plain file name, otherwise 400 is returned.</li>
 * <li>{@code xian} - divisor applied to the image width/height to get the length of the
 *     line-extraction kernels (default 20); a larger value means a shorter kernel, so more
 *     lines are detected.</li>
 * <li>{@code x_biao} / {@code y_biao} - a slice is kept only if its width / height in pixels is
 *     greater than this value (default 5).</li>
 * </ul>
 * Missing, blank or non-numeric numeric parameters fall back to their defaults.
 *
 * <p>Request attributes set before forwarding to {@code tutableread.jsp}:
 * {@code data} (key {@code table_N} mapped to {@code "columns_rows"}), {@code shibiedata}
 * (key {@code shibieN} mapped to the recognised text, cells separated by {@code &_&} and every
 * row terminated by {@code #_#}) and {@code time} (elapsed seconds as a float).
 *
 * <p>All files in the web application's {@code /images/} directory are deleted at the start of
 * every request and the new table/cell images are written there.
 *
 * @throws ServletException if {@code /images/} cannot be resolved to a real path or the thread
 *                          is interrupted while throttling OCR calls
 * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
 */
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    long startTime = System.currentTimeMillis();

    String basePath = request.getSession().getServletContext().getRealPath("/images/");
    if (basePath == null) {
        throw new ServletException("Cannot resolve the real path of /images/");
    }
    // Remove the images left over from the previous run.
    File dir = new File(basePath);
    File[] oldFiles = dir.isDirectory() ? dir.listFiles() : null;
    if (oldFiles != null) { // listFiles() returns null when the directory cannot be read
        for (File f : oldFiles) {
            f.delete();
        }
    }

    // The name comes straight from the client, so refuse anything that could leave the directory.
    String imageName = request.getParameter("name");
    if (!isPlainFileName(imageName)) {
        response.sendError(HttpServletResponse.SC_BAD_REQUEST, "invalid name parameter");
        return;
    }
    Mat src = Imgcodecs.imread("C:/Users/lilin/Desktop/" + imageName + ".png");
    if (src.empty()) {
        System.out.println("not found file");
        response.sendError(HttpServletResponse.SC_NOT_FOUND, "not found file");
        return;
    }

    // Grayscale -> inverted -> adaptive threshold, giving a binary image to extract lines from.
    Mat gray = new Mat();
    Imgproc.cvtColor(src, gray, Imgproc.COLOR_BGR2GRAY);
    Mat inverted = new Mat();
    Core.bitwise_not(gray, inverted);
    Mat thresh = new Mat();
    Imgproc.adaptiveThreshold(inverted, thresh, 255, Imgproc.ADAPTIVE_THRESH_MEAN_C, Imgproc.THRESH_BINARY, 15, -2);

    // Extract horizontal and vertical lines separately with long, thin structuring elements
    // (erode then dilate), then combine them to find the tables and the line crossings.
    int scale = intParameter(request, "xian", 20);
    if (scale <= 0) {
        scale = 20; // a non-positive divisor cannot produce a kernel size
    }
    Mat horizontal = thresh.clone();
    Mat vertical = thresh.clone();

    int horizontalsize = horizontal.cols() / scale;
    Mat horizontalStructure = Imgproc.getStructuringElement(Imgproc.MORPH_RECT, new Size(horizontalsize, 1));
    Imgproc.erode(horizontal, horizontal, horizontalStructure, new Point(-1, -1), 1);
    Imgproc.dilate(horizontal, horizontal, horizontalStructure, new Point(-1, -1), 1);

    int verticalsize = vertical.rows() / scale;
    Mat verticalStructure = Imgproc.getStructuringElement(Imgproc.MORPH_RECT, new Size(1, verticalsize));
    Imgproc.erode(vertical, vertical, verticalStructure, new Point(-1, -1), 1);
    Imgproc.dilate(vertical, vertical, verticalStructure, new Point(-1, -1), 1);

    // mask = all table lines in one image; joints = points where the two line sets cross.
    Mat mask = new Mat();
    Core.add(horizontal, vertical, mask);
    Mat joints = new Mat();
    Core.bitwise_and(horizontal, vertical, joints);

    // Every sufficiently large outer contour of the line mask is a table candidate.
    List<MatOfPoint> contours = new ArrayList<MatOfPoint>();
    Imgproc.findContours(mask, contours, new Mat(), Imgproc.RETR_EXTERNAL, Imgproc.CHAIN_APPROX_SIMPLE, new Point(0, 0));

    List<Mat> tables = new ArrayList<Mat>();
    List<int[]> tableXs = new ArrayList<int[]>(); // sorted cut positions on the x axis, per table
    List<int[]> tableYs = new ArrayList<int[]>(); // sorted cut positions on the y axis, per table
    for (MatOfPoint contour : contours) {
        // Small areas are stray lines, not tables.
        if (Imgproc.contourArea(contour) < 100) {
            continue;
        }
        Rect bounds = Imgproc.boundingRect(contour);

        // Line crossings inside this table; fewer than 4 means there is no complete cell.
        Mat tableJoints = joints.submat(bounds);
        List<MatOfPoint> jointContours = new ArrayList<MatOfPoint>();
        Imgproc.findContours(tableJoints, jointContours, new Mat(), Imgproc.RETR_CCOMP, Imgproc.CHAIN_APPROX_SIMPLE);
        if (jointContours.size() < 4) {
            continue;
        }

        // Collect the distinct x and y coordinates of all crossing points (relative to the table).
        Map<String, Double> xs = new HashMap<String, Double>();
        Map<String, Double> ys = new HashMap<String, Double>();
        for (MatOfPoint jointContour : jointContours) {
            for (Point p : jointContour.toArray()) {
                xs.put("x" + p.x, p.x);
                ys.put("y" + p.y, p.y);
            }
        }
        tableXs.add(maptoint(xs));
        tableYs.add(maptoint(ys));
        tables.add(src.submat(bounds).clone());
    }

    // Thresholds are the same for every table, so read them once.
    int x_biao = intParameter(request, "x_biao", 5);
    int y_biao = intParameter(request, "y_biao", 5);

    Map<String, String> jspdata = new HashMap<String, String>();
    int[] columnCounts = new int[tables.size()];
    int[] rowCounts = new int[tables.size()];
    for (int i = 0; i < tables.size(); i++) {
        Mat table = tables.get(i);
        int width = table.cols();
        int height = table.rows();
        int[] x_z = tableXs.get(i);
        int[] y_z = tableYs.get(i);

        // Vertical cuts: one strip between each pair of neighbouring x positions.
        List<Mat> columns = new ArrayList<Mat>();
        int x_len = 0;
        for (int j = 0; j < x_z.length; j++) {
            int left = (j == 0) ? 0 : x_z[j - 1];
            Mat strip = new Mat(table, new Rect(left, 0, x_z[j] - left, height));
            if (strip.cols() > x_biao) {
                columns.add(strip);
                x_len++;
            }
        }
        // Strip after the last cut position. It is saved but, as before, not counted as a column.
        if (x_z.length > 1) {
            int lastX = x_z[x_z.length - 1];
            Mat tail = new Mat(table, new Rect(lastX, 0, width - lastX, height));
            if (tail.cols() > x_biao) { // test the tail itself, not the strip before it
                columns.add(tail);
            }
        }

        imshow(basePath, table, "table_" + i + ".png");

        // Horizontal cuts of every column strip; each kept cell is written as table_i_column_row.png.
        int y_len = 0;
        for (int j = 0; j < columns.size(); j++) {
            Mat column = columns.get(j);
            int columnWidth = column.cols();
            int columnHeight = column.rows();
            int cy_len = 0;
            for (int k = 0; k < y_z.length; k++) {
                int top = (k == 0) ? 0 : y_z[k - 1];
                Mat cell = new Mat(column, new Rect(0, top, columnWidth, y_z[k] - top));
                if (cell.rows() > y_biao) {
                    imshow(basePath, cell, "table_" + i + "_" + j + "_" + cy_len + ".png");
                    cy_len++;
                }
            }
            // Cell after the last cut position. It is saved but, as before, not counted as a row.
            if (y_z.length > 1) {
                int lastY = y_z[y_z.length - 1];
                Mat tail = new Mat(column, new Rect(0, lastY, columnWidth, columnHeight - lastY));
                if (tail.rows() > y_biao) { // test the tail itself, not the cell before it
                    imshow(basePath, tail, "table_" + i + "_" + j + "_" + cy_len + ".png");
                }
            }
            y_len = cy_len;
        }

        columnCounts[i] = x_len;
        rowCounts[i] = y_len;
        jspdata.put("table_" + i, x_len + "_" + y_len);
    }
    request.setAttribute("data", jspdata);

    // Baidu OCR on every counted cell.
    AipOcr client = new AipOcr(APP_ID, API_KEY, SECRET_KEY);
    client.setConnectionTimeoutInMillis(2000);
    client.setSocketTimeoutInMillis(60000);

    Map<String, String> jspdata1 = new HashMap<String, String>();
    // Walk the tables by index so file names and result keys always refer to the same table
    // (HashMap iteration order is not the insertion order).
    for (int t = 0; t < tables.size(); t++) {
        StringBuilder recognised = new StringBuilder();
        for (int row = 0; row < rowCounts[t]; row++) {
            for (int col = 0; col < columnCounts[t]; col++) {
                if (col > 0) {
                    recognised.append("&_&");
                }
                String name = "table_" + t + "_" + col + "_" + row + ".png";
                String text = recogniseCell(client, basePath + "/" + name);
                recognised.append(text.length() == 0 ? " " : text);
                try {
                    Thread.sleep(400); // stay under Baidu's QPS limit
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    throw new ServletException("Interrupted while throttling OCR requests", e);
                }
            }
            recognised.append("#_#");
        }
        jspdata1.put("shibie" + t, recognised.toString());
    }

    long endTime = System.currentTimeMillis();
    request.setAttribute("time", (float) (endTime - startTime) / 1000);
    request.setAttribute("shibiedata", jspdata1);
    request.getRequestDispatcher("tutableread.jsp").forward(request, response);
}

/** Returns true when {@code name} is non-empty and contains no path separator, colon or NUL. */
private static boolean isPlainFileName(String name) {
    if (name == null || name.length() == 0) {
        return false;
    }
    for (int i = 0; i < name.length(); i++) {
        char c = name.charAt(i);
        if (c == '/' || c == '\\' || c == ':' || c == '\0') {
            return false;
        }
    }
    return true;
}

/** Reads an int request parameter, returning {@code fallback} when it is missing, blank or not a number. */
private static int intParameter(HttpServletRequest request, String name, int fallback) {
    String raw = request.getParameter(name);
    if (raw == null || raw.trim().length() == 0) {
        return fallback;
    }
    try {
        return Integer.parseInt(raw.trim());
    } catch (NumberFormatException e) {
        return fallback;
    }
}

/** Runs Baidu general OCR on one image file and returns the joined words, or "" when the reply has no usable result. */
private String recogniseCell(AipOcr client, String imagePath) {
    JSONObject res = client.basicGeneral(imagePath, new HashMap<String, String>());
    try {
        return getjsontext((JSONArray) res.get("words_result"));
    } catch (RuntimeException e) {
        // Best effort: one unreadable cell must not abort the whole table.
        System.out.println("cuowu: " + imagePath + " -> " + e);
        return "";
    }
}
/**
 * Writes an image to {@code basePath + "/" + name}. Despite the name, nothing is displayed.
 *
 * @param basePath directory the file is written into
 * @param dst      image to write
 * @param name     file name, including the extension
 */
public void imshow(String basePath, Mat dst, String name) {
    String target = basePath + "/" + name;
    // imwrite reports failure through its return value; surface it instead of dropping it.
    if (!Imgcodecs.imwrite(target, dst)) {
        System.err.println("failed to write image: " + target);
    }
}
/**
 * Concatenates the {@code "words"} value of every element of the array, in order and without
 * any separator.
 *
 * @param array array whose elements are JSON objects carrying a {@code "words"} entry
 * @return the joined text; empty when the array has no elements
 */
public String getjsontext(JSONArray array) {
    StringBuilder joined = new StringBuilder();
    for (int idx = 0; idx < array.length(); idx++) {
        JSONObject entry = (JSONObject) array.get(idx);
        joined.append(entry.get("words"));
    }
    return joined.toString();
}
/**
 * Converts the values of the given map to ints (via {@link Double#intValue()}) and returns them
 * in ascending order. The keys are ignored.
 *
 * @param x map whose values are collected
 * @return a new sorted array with one element per map entry
 */
public int[] maptoint(Map<String, Double> x) {
    int[] sorted = new int[x.size()];
    int next = 0;
    for (Double value : x.values()) {
        sorted[next++] = value.intValue();
    }
    Arrays.sort(sorted);
    return sorted;
}

}

效果圖（我主要是要取得資料單元格，所以拆分得比較粗略，沒有對合併的單元格做處理）

參考文章

https://my.oschina.net/u/3767256/blog/1615720

https://blog.csdn.net/yomo127/article/details/52045146

如果你對此感興趣的可以加群261074724討論

java+OpenCV3 +百度OCR(或tesseract) 識別表格資料

java+OpenCV3 +百度OCR(或tesseract) 識別表格資料

Java文字識別軟體-呼叫百度ocr實現文字識別

JAVA實現百度OCR文字識別功能

使用百度ocr接口識別驗證碼

Java呼叫百度API實現文字識別-羅紹崗-專題視訊課程

[百度OCR]BDOcr.getText(識別圖上的文字）

C#百度OCR-身份證圖片識別提取資訊

百度OCR驗證碼識別連線

tp5引入百度ocr實現文字識別

Java實現百度雲OCR介面識別圖片文字資訊（也包含身份證，銀行卡識別,更新新增通用票據識別）

借助百度OCR，實現一鍵識別圖片中文字，就是這麽酷！

百度ocr文字識別接口使用

用百度ocr+微信截圖實現文字識別

java呼叫百度AI識別圖片上的文字功能

java 根據圖片url識別圖片簡單例項(百度雲通用文字識別)

Java-基於百度API的圖片文字識別（支援中文，英文和中英文混合）

Java-基於百度API的圖片文字識別（支援中文，英文和中英文混合）貨運APP開發找上海捌躍網路科技有限公司

百度OCR文字識別image format error問題

Java實現百度雲文字識別介面程式碼

百度OCR文字識別企業版 Object-C 識別兩種型別程式碼示例

java+OpenCV3 +百度OCR(或tesseract) 識別表格資料

相關推薦