利用pdfbox讀取pdf檔案內容和圖片
最近用pdfbox讀取pdf檔案中的內容和圖片,可以獲取每一頁的內容和圖片,但有個問題是沒法獲取圖片在頁面的位置。原始碼如下:
package com.util;
import java.awt.image.BufferedImage;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.util.PDFTextStripper;
public class PdfBoxUtil {
try {
InputStream inputStream = new BufferedInputStream(new FileInputStream(new File("D:/android/a.pdf")));
//PDFParser parser = new PDFParser( inputStream );
//parser.parse();
PDDocument pdfDocument = PDDocument.load(inputStream);
//PDDocument pdfDocument = parser.getPDDocument();
StringWriter writer = new StringWriter();
PDFTextStripper stripper = new PDFTextStripper();
stripper.writeText(pdfDocument, writer);
String contents = writer.getBuffer().toString();
/*
PDDocumentInformation documentInformation = pdfDocument.getDocumentInformation();
System.out.println("標題:" + documentInformation.getTitle());
PDDocumentInformation info = pdfDocument.getDocumentInformation();
System.out.println("標題:" + info.getTitle());
System.out.println("主題:" + info.getSubject());
System.out.println("作者:" + info.getAuthor());
System.out.println("關鍵字:" + info.getKeywords());
System.out.println("應用程式:" + info.getCreator());
System.out.println("pdf 製作程式:" + info.getProducer());
System.out.println("作者:" + info.getTrapped());
System.out.println("建立時間:" + dateFormat(info.getCreationDate()));
System.out.println("修改時間:" + dateFormat(info.getModificationDate()));
*/
/** 文件頁面資訊 **/
PDDocumentCatalog cata = pdfDocument.getDocumentCatalog();
List pages = cata.getAllPages();
int count = 1;
for (int i = 0; i < pages.size(); i++) {
PDPage page = (PDPage) pages.get(i);
if (null != page) {
//本頁面文字內容
StringWriter sw = new StringWriter();
PDFTextStripper pst = new PDFTextStripper();
pst.setStartPage(i+1);
pst.setEndPage(i+1);
pst.writeText(pdfDocument, sw);
String content = sw.getBuffer().toString();
System.out.println(content);
PDResources res = page.findResources() ;
// 獲取頁面圖片資訊
Map imgs = res.getImages();
if (null != imgs) {
Set keySet = imgs.keySet();
Iterator it = keySet.iterator();
while (it.hasNext()) {
Object obj = it.next();
PDXObjectImage img = (PDXObjectImage) imgs.get(obj);
img.write2file("D:/" + count);
count++;
}
}
}
}
} catch (Exception e) {
// TODO 自動生成 catch 塊
e.printStackTrace();
}
}
public static String dateFormat(Calendar calendar) throws Exception {
if (null == calendar)
return null;
String date = null;
try {
String pattern = "yyyy-MM-dd";
SimpleDateFormat format = new SimpleDateFormat(pattern);
date = format.format(calendar.getTime());
} catch (Exception e) {
throw e;
}
return date == null ? "" : date;
}
}
最後實在沒辦法,只好將每一頁的內容轉換成圖片。將上面 for 迴圈中處理單一頁面的那段程式碼(原文以紅色標示)替換為以下片段,即可把每一頁輸出成一張 PNG 圖片。
// Convert the whole page to an image and save it as a PNG file named after
// the loop index i of the surrounding page loop.
if (page != null) {
    BufferedImage rendered = page.convertToImage();
    ImageIO.write(rendered, "PNG", new File("D:/" + i + ".PNG"));
}