Lucene 實現txt檔案的構建索引與查詢

阿新 • • 發佈：2019-02-17

package net.jqsoft.hecv.util;
import net.sf.json.JSONArray;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
 
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import  
org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import  
org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Created by tianhj on 2017/6/14.
 */
/**
 * Small demo that builds a Lucene index over the .txt files of a directory
 * and runs highlighted full-text queries against it.
 *
 * <p>The same {@link IKAnalyzer} instance is used for indexing and for query
 * parsing. Instances keep the currently open writer/directory in fields and
 * are not designed to be shared between threads.
 */
public class LuceneTest {

    /** Directory in which the Lucene index is stored. */
    private String indexPath = "F:\\lucene\\luceneIndex";
    /** Directory that contains the .txt files to be indexed. */
    private String dataPath = "F:\\lucene\\luceneData";
    /** IK analyzer (Chinese word segmentation), used for indexing and querying. */
    private Analyzer analyzer = new IKAnalyzer();
    /** Writer used while (re)building the index; null when no build is running. */
    private IndexWriter indexWriter;
    /** Currently opened index directory; null when nothing is open. */
    private Directory directory;
    /** Marker inserted before a highlighted term. */
    private static final String STARTTAG = "<";
    /** Marker inserted after a highlighted term. */
    private static final String ENDTAG = ">";

    public static void main(String[] args) {
        LuceneTest luceneTest = new LuceneTest();
        // Uncomment to (re)build the index before searching:
        // luceneTest.buildTxtIndex();
        luceneTest.searchTxt("勒索病毒手機");
    }

    /**
     * Searches the "content" field of the index and prints every hit
     * (at most 1000) with the matching terms highlighted.
     *
     * <p>Failures are reported on stderr instead of being thrown, so the
     * method keeps its original "never throws" contract.
     *
     * @param text the keywords / query string; null or blank input is ignored
     */
    public void searchTxt(String text) {
        if (text == null || text.trim().length() == 0) {
            System.err.println("searchTxt: empty query, nothing to search");
            return;
        }

        // Parse first: without a valid query there is no point in opening the index.
        Query query;
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_43, "content", analyzer);
            query = parser.parse(text);
        } catch (ParseException e) {
            e.printStackTrace();
            return;
        }

        DirectoryReader ireader = null;
        try {
            directory = FSDirectory.open(new File(indexPath));
            ireader = DirectoryReader.open(directory);
            IndexSearcher searcher = new IndexSearcher(ireader);
            SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter(STARTTAG, ENDTAG);
            Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));

            ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
            for (int i = 0; i < hits.length; i++) {
                Document doc = searcher.doc(hits[i].doc);
                String stored = doc.get("content");
                String content = (stored == null) ? "" : stored.trim();

                TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(content));
                String fragment = highlighter.getBestFragment(tokenStream, content);
                if (fragment == null) {
                    // No highlightable fragment: show the raw content rather than "null".
                    fragment = content;
                }
                System.out.print((i + 1) + ": 檔名稱" + doc.get("filename"));
                System.out.println("---------檔案內容：" + fragment);
            }
        } catch (Exception e) {
            // Previously swallowed silently; make the failure visible.
            System.err.println("searchTxt: search failed for index " + indexPath);
            e.printStackTrace();
        } finally {
            closeQuietly(ireader);
            closeQuietly(directory);
            directory = null;
        }
    }

    /**
     * Rebuilds the index from scratch: wipes the existing index and adds one
     * document per .txt file found directly inside {@code dataPath}.
     *
     * <p>Failures are reported on the console instead of being thrown.
     */
    public void buildTxtIndex() {
        try {
            directory = FSDirectory.open(new File(indexPath));
            indexWriter = getIndexWriter(directory);
            indexWriter.deleteAll(); // start from an empty index
        } catch (Exception e) {
            System.out.println("索引開啟異常！");
            e.printStackTrace();
            // Without a writer nothing below can work; release what was opened and stop.
            releaseIndex();
            return;
        }

        try {
            for (File file : getFileList(dataPath)) {
                Document document = fileToDocument(file);
                indexWriter.addDocument(document);
                System.out.println("filename：" + document.get("filename") + ";content：" + document.get("content"));
            }
            indexWriter.commit();
        } catch (Exception e) {
            // Previously swallowed silently; make the failure visible.
            System.err.println("buildTxtIndex: indexing failed for " + dataPath);
            e.printStackTrace();
        } finally {
            // Always release the writer (and its lock) and the directory.
            releaseIndex();
        }
    }

    /**
     * Creates an IndexWriter for the given directory using this instance's analyzer.
     *
     * @param dir the index directory to write to
     * @return a new writer configured for Lucene 4.3
     * @throws Exception if the writer cannot be created
     */
    public IndexWriter getIndexWriter(Directory dir) throws Exception {
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);
        return new IndexWriter(dir, iwc);
    }

    /**
     * Closes the current IndexWriter, if any, and forgets it.
     *
     * @throws Exception if closing the writer fails
     */
    public void closeWriter() throws Exception {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } finally {
                indexWriter = null;
            }
        }
    }

    /**
     * Converts a file into a Lucene document with the stored fields
     * "filename", "content" and "size".
     *
     * @param file the text file to convert
     * @return the document describing the file
     * @throws Exception if the file cannot be read
     */
    public Document fileToDocument(File file) throws Exception {
        Document document = new Document();
        document.add(new Field("filename", file.getName(), TextField.TYPE_STORED));
        document.add(new Field("content", getFileContent(file), TextField.TYPE_STORED));
        // file.length() is the size of the file itself; getTotalSpace() would be
        // the capacity of the whole partition the file lives on.
        document.add(new Field("size", String.valueOf(file.length()), TextField.TYPE_STORED));
        return document;
    }

    /**
     * Reads the whole file, decoded as GBK, into one string. Lines are
     * concatenated without their line terminators.
     *
     * @param file the file to read
     * @return the file content as a single string
     * @throws Exception if the file cannot be opened or read
     */
    public String getFileContent(File file) throws Exception {
        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "GBK"));
        try {
            StringBuilder result = new StringBuilder();
            String lineTxt;
            while ((lineTxt = br.readLine()) != null) {
                result.append(lineTxt);
            }
            return result.toString();
        } finally {
            // Closing the buffered reader also closes the wrapped streams,
            // and now happens even when reading fails.
            br.close();
        }
    }

    /**
     * Collects the .txt files located directly inside the given directory.
     *
     * @param dirPath path of the directory to scan
     * @return the matching files; empty if the path is not a readable directory
     */
    public List<File> getFileList(String dirPath) {
        List<File> fileList = new ArrayList<File>();
        File[] files = new File(dirPath).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            return fileList;
        }
        for (File file : files) {
            if (file.isFile() && isTxtFile(file.getName())) {
                fileList.add(file);
            }
        }
        return fileList;
    }

    /**
     * Tells whether a file name denotes a .txt file, i.e. ends with ".txt"
     * (case-insensitive) and has at least one character before the extension.
     *
     * @param fileName the file name to test; null yields false
     * @return true if the name ends with the .txt extension
     */
    public boolean isTxtFile(String fileName) {
        final String extension = ".txt";
        if (fileName == null || fileName.length() <= extension.length()) {
            return false;
        }
        int start = fileName.length() - extension.length();
        return fileName.regionMatches(true, start, extension, 0, extension.length());
    }

    /** Closes the writer and the directory held in the fields, reporting (not throwing) failures. */
    private void releaseIndex() {
        try {
            closeWriter();
        } catch (Exception e) {
            e.printStackTrace();
        }
        closeQuietly(directory);
        directory = null;
    }

    /** Closes the resource if it is non-null; a failure to close is reported but not propagated. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Lucene 實現txt檔案的構建索引與查詢

package net.jqsoft.hecv.util; import net.sf.json.JSONArray; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis

vue+element+springboot實現.txt檔案上傳

前端html程式碼 <div class="p-upload-box"> <form action='dic/uploadWord' enctype='multipart/form-data' method='post' id="fileUpl

IR中python 寫倒排索引與查詢處理

學習資訊檢索課程，老師讓寫一個倒排索引與查詢處理的程式，於是抱著試試的心態自學python寫了出來。整個沒有什麼太大的演算法技巧，唯一的就是查詢處理那裡遞迴函式正反兩次反覆查詢需要多除錯下。資料結構： #-*-coding:utf-8-*- #!/usr/bin/pyt

MongoDB的索引與查詢優化

MongoDB的索引的機制與普通資料庫基本相似，主要有如下幾部分：單欄位索引 MongoDB預設為所有集合建立了一個_id欄位的單欄位索引，該索引唯一，且不能刪除（_id為集合的主鍵）索引的建立方法： db.customers.ensureInd

java web開發實現properties檔案的讀取與解析

在java web 開發的過程當中，由於涉及到附件的上傳，這樣就必然需要設定檔案的路徑，如果在程式碼中寫死檔案上傳的路徑，必然是不合理的。那麼通常的做法就是將相關設定放在配置檔案當中，

簡易實現, txt檔案顯示在html上

<textarea data-dojo-attach-point="txtViewForm" style="font-size: 15px; width: 100%;height: 100%" readonly="readonly" > this.txtViewForm.value

lucene4.8.0 + IKAnalyzer5.0.1 建立索引與查詢demo

主要程式碼：建立索引： public void createIndex(){ try { // 有檔案系統或者記憶體儲存方式,這裡使用檔案系統儲存索引資料 Directory directory = new SimpleFSDirectory(new Fi

MapReduce程式設計實現txt檔案中的內容匯入HBase

一、建立java專案。寫入程式碼，如下： [java] view plain copy print? package translate1; import java.io.IOException; import org.apache.hadoo

C++ 實現txt檔案的讀取

最近臨時接到專案，加緊學習了一下C++，只是簡單的檔案的讀取就弄了好久的說~~ 現在特意分享一下，希望對小夥伴們會有幫助喔~~ 1. 實現txt檔案的讀入並重寫入另外一個txt檔案中~ #include<fstream> //ifstream #include

python 實現txt檔案按 value值排序從高到低

with open('sort.txt','w+') as w: while True: sorted_lines=sorted(open('1.txt'), key=lambda s: s.split()[4],reverse=1)

Lucene實現索引和查詢

ont termquery 文件夾移植指定安裝過程 buffer upd 遇到 0引言　　隨著萬維網的發展和大數據時代的到來，每天都有大量的數字化信息在生產、存儲、傳遞和轉化，如何從大量的信息中以一定的方式找到滿足自己需求的信息，使之有序化並加以利用成為一大難題。全

KD-tree的原理以及構建與查詢操作的python實現

前幾天小組討論會上展示了kd-tree（k－dimension tree），感覺這玩意兒還挺有用的，所以學習了一下它的原理，然後把其中的構建kd-tree以及對應的查詢操作實現了一下，現在跟大家分享一下首先說一下什麼是kd-tree吧，不過首先得說一下bst(二叉查詢樹)，遞迴定義如下：如果左子樹上的節點

Lucene 4.X 倒排索引原理與實現: (1) 詞典的設計

詞典的格式設計詞典中所儲存的資訊主要是三部分： Term字串 Term的統計資訊，比如文件頻率(Document Frequency) 倒排表的位置資訊其中Term字串如何儲存是一個很大的問題，根據上一章基本原理的表述中，我們知道，寫入檔案的Term是按照字典順序排好序的，那麼如何將這些

SpringCloud學習筆記024---SpringBoot整合Lucene實現全文檢索_分詞_索引_更新_刪除文件_詞條搜尋_多條件查詢

先看程式碼實現,下面有lucene介紹: 測試用例 Github 程式碼程式碼我已放到 Github ，匯入spring-boot-lucene-demo 專案新增依賴  <dependency>

Lucene簡單實現建立索引以及查詢

package com.rdz.test; import java.io.File; import java.io.FileReader; import java.io.IOException; import org.apache.lucene.analysis.Ana

索引的重建與查詢

where _id 錯誤信息 set 提高 ble cnblogs 指定表 physical sp_helpindex ‘tablename‘ ----------------------------------------------------------------

struts2實現檔案上傳與下載功能

一、Demo介紹基於struts2框架，實現多檔案的上傳和下載功能。實現原理圖：部分介面圖：上傳成功及下載頁面：二、主要程式碼 uploadFile.jsp:在form表單中包含一個文字框（上傳使用者的姓名）和兩個檔案上傳選項. <%@

python基於併發與socket實現遠端檔案傳輸程式

FTP程式 Client: * bin/start.py 程式入口 * conf/配置檔案存放 * core/ * auth.py 登陸，註冊以及上傳下載檢視當前資料夾下檔案以及刪除功能存放 * cline.py 與服務端通訊 * home 本地使用者目錄 Server： * bin/

關於myeclipse實現檔案上傳與使用的路徑問題

在檔案上傳的時候編寫檔案儲存應該儲存到 myeclipse 的workspace的工程目錄下面，而不是放到tomcat的webapps下面。否則eclipse 無法更新檔案。換句話講，在eclipse中新增檔案，tomcat的專案檔案中可以看見新增的檔案，但是反過來，在tomcat的工程目錄下

資料結構——排序與查詢（2）——希爾排序（C++實現）

希爾排序原理希爾排序（Shell’s Sort）,也稱為“縮小增量排序”，是一種插入排序類的演算法。最簡單的插入排序，我在上一個專欄的一篇文章C++抽象程式設計——演算法分析（8）——插入排序演算法與分析有提到過，這裡就不再贅述，這裡就只介紹一些我以前沒寫過的演算法。希爾排序是一

Lucene 實現txt檔案的構建索引與查詢

相關推薦