1. 程式人生 > >lucene建立pdf檔案內容全文索引

lucene建立pdf檔案內容全文索引

第一次寫部落格,有很多不足,也許詞不達意,也許有其他問題,作為一個新手,第一次使用lucene,貼出程式碼的目的是為了交流。

idea maven專案,pom依賴:

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.8</version>
</dependency>
<!--PDF加密和簽名-->
<dependency>
    <groupId>org.bouncycastle</groupId>
    <artifactId>bcprov-jdk15</artifactId>
    <version>1.44</version>
</dependency>
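下面是用到的包(注意:後面的程式碼匯入的是 org.apache.pdfbox.util.PDFTextStripper,這是 PDFBox 1.8.x 的套件路徑;在 2.x 中該類別已移到 org.apache.pdfbox.text,因此程式碼需要搭配下方的 pdfbox 1.8.10,而不是上面宣告的 2.0.8):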
<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.5.10</version>
</dependency>
<dependency>
    <groupId>com.itextpdf.tool</groupId>
    <artifactId>xmlworker</artifactId>
    <version>5.5.9</version>
</dependency>
<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itext-asian</artifactId>
    <version>5.2.0</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>1.8.10</version>
</dependency>
下面是程式碼實現:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.FSDirectory;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.Date;
import static net.gddata.common.util.StringUtil.getString;
/**
 * Lucene helper that builds a per-page full-text index from the PDF files found directly inside
 * a directory, and opens readers over an existing on-disk index.
 *
 * <p>Created by Ren on 18/1/25.
 */
@Component
public class IndexUtil {

    /**
     * Opens an {@link IndexReader} over the file-system index stored at {@code path}.
     *
     * @param path directory containing an existing Lucene index
     * @return the opened reader, or {@code null} when the index cannot be opened (the failure is
     *         printed to standard output); callers must check for {@code null}
     */
    public IndexReader getIndexReaderByFSD(String path) {
        FSDirectory fsDirectory = null;
        try {
            fsDirectory = FSDirectory.open(Paths.get(path));
            return DirectoryReader.open(fsDirectory);
        } catch (IOException e) {
            System.out.println("初始化索引目錄失敗!(IndexReaderUtil.java)" + e.toString());
            e.printStackTrace();
            // No reader was created, so nothing else will use this directory handle: release it.
            if (fsDirectory != null) {
                try {
                    fsDirectory.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
        return null;
    }

    /**
     * Indexes every PDF located directly in {@code filePath} (sub-directories are skipped) into a
     * new, timestamp-named index directory created under {@code indexPath}. Each PDF page becomes
     * one Lucene document with the stored fields {@code contents}, {@code page},
     * {@code filepath} and {@code filename}. Files whose extension is not {@code pdf} are ignored.
     *
     * @param filePath  directory whose files should be indexed
     * @param indexPath parent directory under which the index directory is created
     * @throws IOException if {@code filePath} cannot be listed, the index directory or writer
     *                     cannot be created, or a PDF cannot be read or indexed
     */
    public void createIndex(String filePath, String indexPath) throws IOException {
        // listFiles() returns null (rather than throwing) for a missing or non-directory path.
        File[] files = new File(filePath).listFiles();
        if (files == null) {
            throw new IOException("Not a readable directory: " + filePath);
        }

        String path = indexPath + "/" + createIndexDirectory();
        File index = new File(path);
        // mkdirs() also returns false when the directory already exists, hence the second check.
        if (!index.mkdirs() && !index.isDirectory()) {
            throw new IOException("Could not create index directory: " + path);
        }

        // try-with-resources closes the writer and the directory even when indexing fails,
        // so the index write lock is not left behind.
        try (FSDirectory dir = FSDirectory.open(Paths.get(path));
             IndexWriter indexWriter = new IndexWriter(dir, newWriterConfig())) {
            for (int i = 0; i < files.length; i++) {
                File file = files[i];
                if (file.isDirectory()) {
                    // Only regular files directly inside filePath are indexed.
                    continue;
                }
                // Position of the file in the listing.
                System.out.println("這是第" + i + "個檔案----------------");
                // Full path of the file.
                System.out.println("完整路徑:" + file.toString());
                String fileName = file.getName();
                // Text after the last '.' (the whole name when there is no '.') is the file type.
                String fileType = fileName.substring(fileName.lastIndexOf(".") + 1).toLowerCase();
                System.out.println("檔名稱:" + fileName);
                System.out.println("檔案型別:" + fileType);

                if (fileType.equals("pdf")) {
                    indexPdf(file, indexWriter);
                    System.out.println("注意:已為檔案“" + fileName + "”建立了索引");
                } else if (!fileType.isEmpty()) {
                    System.out.println();
                }
            }
            // Report how many documents the index now holds.
            System.out.println("numDocs=" + indexWriter.numDocs());
        }
    }

    /** Builds the writer configuration: standard analyzer, create-or-append, log-doc merging. */
    private IndexWriterConfig newWriterConfig() {
        // A simple analyzer that tokenizes the text being indexed.
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        LogDocMergePolicy mergePolicy = new LogDocMergePolicy();
        mergePolicy.setMinMergeDocs(1000);
        iwc.setMaxBufferedDocs(20000);
        iwc.setMergePolicy(mergePolicy);
        return iwc;
    }

    /** Adds one Lucene document per page of {@code pdf} to {@code indexWriter}, then commits. */
    private void indexPdf(File pdf, IndexWriter indexWriter) throws IOException {
        try (InputStream in = new FileInputStream(pdf)) {
            PDFParser parser = new PDFParser(in);
            parser.parse();
            PDDocument pdDocument = parser.getPDDocument();
            try {
                int numberOfPages = pdDocument.getNumberOfPages();
                System.out.println("page==" + numberOfPages);
                if (numberOfPages > 0) {
                    PDFTextStripper stripper = new PDFTextStripper();
                    // Emit text in visual (position) order.
                    stripper.setSortByPosition(true);
                    // Stripper page numbers are 1-based and inclusive, so the bound must be <=
                    // or the last page is never indexed.
                    for (int page = 1; page <= numberOfPages; page++) {
                        stripper.setStartPage(page);
                        stripper.setEndPage(page);
                        // Extract once and reuse: text extraction is the expensive step.
                        String content = stripper.getText(pdDocument);
                        System.out.println("content==" + content);

                        Document doc = new Document();
                        doc.add(new TextField("contents", content, Field.Store.YES));
                        doc.add(new TextField("page", getString(page), Field.Store.YES));
                        doc.add(new TextField("filepath", pdf.getAbsolutePath(), Field.Store.YES));
                        // The file name is stored as a single un-tokenized term.
                        doc.add(new StringField("filename", pdf.getName(), Field.Store.YES));
                        indexWriter.addDocument(doc);
                        System.out.println();
                    }
                    indexWriter.commit();
                }
            } finally {
                // Release the parsed document even when extraction or indexing fails.
                pdDocument.close();
            }
        }
    }

    /**
     * Produces the name of a new index directory from the current local time.
     *
     * @return the current time formatted as {@code yyyyMMddHHmmss}
     */
    public static String createIndexDirectory() {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
        return sdf.format(new Date());
    }

}