lucene建立pdf檔案內容全文索引
阿新 • • 發佈:2019-02-10
第一次寫部落格,有很多不足,也許詞不達意,也許有其他問題,作為一個新手,第一次使用lucene,貼出程式碼的目的是為了交流。
idea maven專案,pom依賴:
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.8</version>
</dependency>
<!--PDF加密和簽名-->
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcprov-jdk15</artifactId>
<version>1.44</version>
</dependency>
下面是用到的包
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.10</version>
</dependency>
<dependency>
<groupId>com.itextpdf.tool</groupId>
<artifactId>xmlworker</artifactId>
<version>5.5.9</version>
</dependency>
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itext-asian</artifactId>
<version>5.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>1.8.10</version>
</dependency>
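下面是程式碼實現(注意:程式碼使用的是 PDFBox 1.8.x 的 API,例如 org.apache.pdfbox.util.PDFTextStripper;PDFBox 2.x 已將該類移至 org.apache.pdfbox.text 套件,因此上面的 pdfbox 2.0.8 與 1.8.10 兩個依賴不能同時使用,執行本程式碼請保留 1.8.10):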
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.*; import org.apache.lucene.store.FSDirectory; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; import org.springframework.stereotype.Component; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.file.Paths; import java.text.SimpleDateFormat; import java.util.Date; import static net.gddata.common.util.StringUtil.getString; /** * Created by Ren on 18/1/25. */ @Component public class IndexUtil { public IndexReader getIndexReaderByFSD(String path) { FSDirectory fsDirectory = null; try { fsDirectory = FSDirectory.open(Paths.get(path)); return DirectoryReader.open(fsDirectory); } catch (IOException e) { System.out.println("初始化索引目錄失敗!(IndexReaderUtil.java)" + e.toString()); e.printStackTrace(); } return null; }
public void createIndex(String filePath, String indexPath) throws IOException { // 建立一個簡單的分詞器,可以對資料進行分詞 Analyzer analyzer = new StandardAnalyzer(); IndexWriter indexWriter = null; if (null == indexWriter) { String path = indexPath + "/" + createIndexDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); File index = new File(path); try { if (!index.exists()) {// 如果目錄不存在 index.mkdirs();// 建立資料夾 } FSDirectory dir = FSDirectory.open(Paths.get(path)); LogDocMergePolicy mergePolicy = new LogDocMergePolicy(); mergePolicy.setMinMergeDocs(1000); iwc.setMaxBufferedDocs(20000); iwc.setMergePolicy(mergePolicy); indexWriter = new IndexWriter(dir, iwc); } catch (Exception e) { System.out.println("建立檔案錯誤" + e); e.printStackTrace(); } } // 獲取所有需要建立索引的檔案 File[] files = new File(filePath).listFiles(); for (int i = 0; i < files.length; i++) { if (files[i].isDirectory()){ //判斷如果是不是檔案,則跳過繼續其他檔案迴圈 continue; } // 檔案是第幾個 System.out.println("這是第" + i + "個檔案----------------"); // 檔案的完整路徑 System.out.println("完整路徑:" + files[i].toString()); // 獲取檔名稱 String fileName = files[i].getName(); // 獲取檔案字尾名,將其作為檔案型別 String fileType = fileName.substring(fileName.lastIndexOf(".") + 1, fileName.length()).toLowerCase(); // 檔名稱 System.out.println("檔名稱:" + fileName); // 檔案型別 System.out.println("檔案型別:" + fileType); InputStream in = new FileInputStream(files[i]); if (fileType != null && !fileType.equals("")) { if (fileType.equals("pdf")) { // 獲取pdf文件 PDFParser parser = new PDFParser(in); parser.parse(); PDDocument pdDocument = parser.getPDDocument(); System.out.println("page==" + pdDocument.getNumberOfPages()); int numberOfPages = pdDocument.getNumberOfPages(); if (numberOfPages > 0) { for (int j = 1; j < numberOfPages; j++) { Document doc = new Document(); PDFTextStripper stripper = new PDFTextStripper(); //設定是否排序 stripper.setSortByPosition(true); //設定起始頁 stripper.setStartPage(j); //設定結束頁 stripper.setEndPage(j); 
System.out.println("content==" + stripper.getText(pdDocument)); // 建立Field物件,並放入doc物件中 doc.add(new TextField("contents", stripper.getText(pdDocument), Field.Store.YES)); doc.add(new TextField("page", getString(j), Field.Store.YES)); doc.add(new TextField("filepath", files[i].getAbsolutePath(), Field.Store.YES)); // 建立檔名的域,並放入doc物件中 doc.add(new StringField("filename", files[i].getName(), Field.Store.YES)); // 寫入IndexWriter indexWriter.addDocument(doc); // 換行 System.out.println(); } indexWriter.commit(); } // 關閉文件 pdDocument.close(); System.out.println("注意:已為檔案“" + fileName + "”建立了索引"); } else { System.out.println(); continue; } } } // 檢視IndexWriter裡面有多少個索引 System.out.println("numDocs=" + indexWriter.numDocs()); // 關閉索引 indexWriter.close(); } public static String createIndexDirectory() { SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss"); return sdf.format(new Date()); } }