pdf轉圖片、提取pdf文字、提取pdf圖片
阿新 • • 發佈:2019-01-08
package com.midevip.common.util; | |
import com.itextpdf.text.pdf.PdfReader;
import net.coobird.thumbnailator.Thumbnails;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;

import javax.imageio.IIOImage;
import javax.imageio.ImageIO;
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import javax.imageio.stream.ImageOutputStream;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Iterator;
/**
 * Extracts the text and image content of PDF files using Apache PDFBox.
 * PDFBox website: https://pdfbox.apache.org/
 *
 * <p>Maven dependencies:
 * <pre>{@code
 * <dependency>
 *     <groupId>org.apache.pdfbox</groupId>
 *     <artifactId>fontbox</artifactId>
 *     <version>2.0.1</version>
 * </dependency>
 * <dependency>
 *     <groupId>org.apache.pdfbox</groupId>
 *     <artifactId>pdfbox</artifactId>
 *     <version>2.0.1</version>
 * </dependency>
 * <dependency>
 *     <groupId>com.itextpdf</groupId>
 *     <artifactId>itextpdf</artifactId>
 *     <version>5.5.13</version>
 * </dependency>
 * <dependency>
 *     <groupId>net.coobird</groupId>
 *     <artifactId>thumbnailator</artifactId>
 *     <version>0.4.8</version>
 * </dependency>
 * }</pre>
 */
public class PdfTest { | |
public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss"; | |
/** | |
* | |
* | |
* @param pdfFilePath | |
* @throws Exception | |
*/ | |
public static void extractText(String pdfFilePath) throws Exception{ | |
try (PDDocument document = PDDocument.load(new File(pdfFilePath))) | |
{ | |
AccessPermission ap = document.getCurrentAccessPermission(); | |
if (!ap.canExtractContent()) | |
{ | |
throw new IOException("You do not have permission to extract text"); | |
} | |
PDFTextStripper stripper = new PDFTextStripper(); | |
stripper.setSortByPosition(true); | |
for (int p = 1; p <= document.getNumberOfPages(); ++p) | |
{ | |
// 這裡分為一頁一頁的提取,如果不設定,預設會把所有頁的內容一次性提取出來,根據需要選擇 | |
stripper.setStartPage(p); | |
stripper.setEndPage(p); | |
//提取內容就這一行程式碼 | |
//提取內容很徹底,包括了頁首頁尾的內容也都會被提出來 | |
String text = stripper.getText(document); | |
String pageStr = String.format("page %d:", p); | |
System.out.println(pageStr); | |
//為了打印出來更美觀 | |
for (int i = 0; i < pageStr.length(); ++i) | |
{ | |
System.out.print("-"); | |
} | |
System.out.println(); | |
System.out.println(text.trim()); | |
System.out.println(); | |
} | |
} | |
} | |
public static void pdfParse(String pdfPath) throws Exception { | |
InputStream input = null; | |
PDDocument document = null; | |
try { | |
document = PDDocument.load(new File(pdfPath)); | |
/** 文件屬性資訊 **/ | |
PDDocumentInformation info = document.getDocumentInformation(); | |
System.out.println("標題:" + info.getTitle()); | |
System.out.println("主題:" + info.getSubject()); | |
System.out.println("作者:" + info.getAuthor()); | |
System.out.println("關鍵字:" + info.getKeywords()); | |
System.out.println("應用程式:" + info.getCreator()); | |
System.out.println("pdf 製作程式:" + info.getProducer()); | |
System.out.println("作者:" + info.getTrapped()); | |
System.out.println("建立時間:" + dateFormat(info.getCreationDate())); | |
System.out.println("修改時間:" + dateFormat(info.getModificationDate())); | |
//獲取內容資訊 | |
PDFTextStripper pts = new PDFTextStripper(); | |
String content = pts.getText(document); | |
System.out.println("內容:" + content); | |
/** 文件頁面資訊 **/ | |
PDDocumentCatalog cata = document.getDocumentCatalog(); | |
int count = 1; | |
for (int i = 0; i < document.getNumberOfPages(); i++) { | |
PDPage page = document.getPage(i); | |
if (null != page) { | |
//獲取到所有rescourse資訊 | |
PDResources res = page.getResources(); | |
Iterable<COSName> xit = res.getXObjectNames(); | |
Iterator<COSName> iterator = xit.iterator(); | |
while (iterator.hasNext()){ | |
COSName cosName = iterator.next(); | |
System.out.println(cosName.getName()); | |
//判斷是否圖片資源,這個提取圖片也很徹底,包括頁首頁尾的圖片也會被獲取到 | |
if(res.isImageXObject(cosName)){ | |
PDImageXObject pdImageXObject = (PDImageXObject)res.getXObject(cosName); | |
//這裡儲存圖片我用了谷歌的thumbnailator框架,也可以用自己的方法去儲存BufferedImage物件到本地圖片 | |
Thumbnails.of(pdImageXObject.getImage()).scale(0.9).toFile(new File("D:\\pdf\\"+System.currentTimeMillis()+".jpg")); | |
} | |
} | |
} | |
} | |
} catch (Exception e) { | |
throw e; | |
} finally { | |
if (null != input) | |
input.close(); | |
if (null != document) | |
document.close(); | |
} | |
} | |
/*** | |
* PDF檔案轉PNG圖片,全部頁數 | |
* | |
* @param PdfFilePath pdf完整路徑 | |
* @param dpi dpi越大轉換後越清晰,相對轉換速度越慢 | |
* @return | |
*/ | |
private static boolean pdf2Image(String PdfFilePath, String dstImgFolder, int dpi) { | |
File file = new File(PdfFilePath); | |
PDDocument pdDocument; | |
try { | |
String imgPDFPath = file.getParent(); | |
int dot = file.getName().lastIndexOf('.'); | |
String imagePDFName = file.getName().substring(0, dot); // 獲取圖片檔名 | |
String imgFolderPath = null; | |
if (dstImgFolder.equals("")) { | |
imgFolderPath = imgPDFPath + File.separator + imagePDFName;// 獲取圖片存放的資料夾路徑 | |
} else { | |
imgFolderPath = dstImgFolder + File.separator + imagePDFName; | |
} | |
if (createDirectory(imgFolderPath)) { | |
pdDocument = PDDocument.load(file); | |
PDFRenderer renderer = new PDFRenderer(pdDocument); | |
/* dpi越大轉換後越清晰,相對轉換速度越慢 */ | |
PdfReader reader = new PdfReader(PdfFilePath); | |
int pages = reader.getNumberOfPages(); | |
StringBuffer imgFilePath = null; | |
for (int i = 0; i < pages; i++) { | |
String imgFilePathPrefix = imgFolderPath + File.separator + imagePDFName; | |
imgFilePath = new StringBuffer(); | |
imgFilePath.append(imgFilePathPrefix); | |
imgFilePath.append("_"); | |
imgFilePath.append(String.valueOf(formatNumber(i+1))); | |
imgFilePath.append(".jpg"); | |
File dstFile = new File(imgFilePath.toString()); | |
BufferedImage image = renderer.renderImageWithDPI(i, dpi); | |
ImageWriter writer = ImageIO.getImageWritersByFormatName("jpg").next(); | |
writer.setOutput(ImageIO.createImageOutputStream(dstFile)); | |
ImageWriteParam param = writer.getDefaultWriteParam(); | |
param.setCompressionMode(ImageWriteParam.MODE_EXPLICIT); | |
param.setCompressionQuality(0.3f); | |
writer.write(null, new IIOImage(image, null, null), param); | |
// ImageIO.write(image, "jpg", dstFile); | |
} | |
System.out.println("PDF文件轉圖片成功!"+dstImgFolder); | |
return true; | |
} else { | |
System.out.println("PDF文件轉圖片失敗:" + "建立" + imgFolderPath + "失敗"); | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
return false; | |
} | |
private static String formatNumber(int i){ | |
if(i<10){ | |
return "00"+i; | |
}else if(i<100){ | |
return "0"+i; | |
}else{ | |
return i+""; | |
} | |
} | |
private static boolean createDirectory(String folder) { | |
File dir = new File(folder); | |
if (dir.exists()) { | |
return true; | |
} else { | |
return dir.mkdirs(); | |
} | |
} | |
public static String dateFormat(Calendar calendar) throws Exception { | |
if (null == calendar) | |
return null; | |
String date = null; | |
try { | |
String pattern = DATE_FORMAT; | |
SimpleDateFormat format = new SimpleDateFormat(pattern); | |
date = format.format(calendar.getTime()); | |
} catch (Exception e) { |