java讀取pdf檔案內容
阿新 • • 發佈:2021-11-29
使用 Java 從 PDF 中獲取文字資訊。目前只能讀取文字型 PDF,圖片型(掃描)PDF 的讀取方式尚在研究中。
1.匯入Maven依賴
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.24</version>
</dependency>
2.示例程式碼
package com.example.pdfuntil;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

/**
 * Extracts the text of a PDF file with PDFBox, echoes it line by line to
 * standard output and writes the same lines to a text file.
 */
public class ReadPdf {

    /**
     * Demo entry point: converts a hard-coded sample PDF to a text file.
     *
     * @param args command-line arguments (ignored)
     */
    public static void main(String[] args) {
        String inputPath = "C:/example/PdfTest/pdf_demo1.pdf";
        String outputPath = "C:/example/PdfTest/demo.txt";
        readPdf(inputPath, outputPath);
    }

    /**
     * Reads the text of the PDF at {@code inputPath}, prints every line to
     * standard output and writes every line, terminated by {@code '\n'}, to
     * {@code outputPath} (the file is created or overwritten).
     *
     * <p>The output file is written as UTF-8 so that non-ASCII text is not
     * degraded by the platform default charset. Documents that report
     * themselves as encrypted are skipped and no output file is written.
     * I/O failures are reported on standard error and are not propagated to
     * the caller.
     *
     * @param inputPath  path of the PDF file to read
     * @param outputPath path of the text file to write
     */
    public static void readPdf(String inputPath, String outputPath) {
        // try-with-resources closes the document; no explicit close() is needed.
        try (PDDocument document = PDDocument.load(new File(inputPath))) {
            if (document.isEncrypted()) {
                System.err.println("Skipping encrypted PDF: " + inputPath);
                return;
            }

            // NOTE(review): the original also built a PDFTextStripperByArea with
            // setSortByPosition(true) but never used it; it was dropped as dead
            // code. If position-sorted output was intended, call
            // setSortByPosition(true) on this stripper instead - confirm.
            PDFTextStripper stripper = new PDFTextStripper();
            String pdfText = stripper.getText(document);

            // Accept both Windows (\r\n) and Unix (\n) line endings.
            String[] lines = pdfText.split("\\r?\\n");

            // The writer lives in its own try-with-resources so the file handle
            // is released even when a write fails part-way through.
            try (Writer writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(outputPath), StandardCharsets.UTF_8))) {
                for (String line : lines) {
                    System.out.println(line);
                    writer.write(line);
                    writer.write('\n');
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to convert PDF '" + inputPath + "' to text file '" + outputPath + "'");
            e.printStackTrace();
        }
    }
}