1. 程式人生 > Lucene 實現txt檔案的構建索引與查詢

Lucene 實現txt檔案的構建索引與查詢

package net.jqsoft.hecv.util;
import net.sf.json.JSONArray;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import
org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import
org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; import java.io.*; import java.util.ArrayList; import java.util.List; /** * Created by tianhj on 2017/6/14. */ public class LuceneTest { private String indexPath="F:\\lucene\\luceneIndex";//索引存放路徑 private String dataPath="F:\\lucene\\luceneData";//txt檔案所在路徑 private Analyzer analyzer = new IKAnalyzer();//IK中文分詞器 private IndexWriter indexWriter;//索引器 private Directory directory;//索引庫 private static final String STARTTAG = "<";//高亮開始 private static final String ENDTAG = ">";//高亮結束 public static void main(String[] args) { LuceneTest luceneTest=new LuceneTest(); //luceneTest.buildTxtIndex(); luceneTest.searchTxt("勒索病毒手機"); } /** * Lucene檢索 * @param text 關鍵詞 */ public void searchTxt(String text){ JSONArray array = new JSONArray(); try{ directory= FSDirectory.open(new File(indexPath)); DirectoryReader ireader = DirectoryReader.open(directory);//開啟儲存位置 IndexSearcher searcher = new IndexSearcher(ireader);//建立搜尋器 SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter( STARTTAG, ENDTAG);//查詢結果高亮轉換器 QueryParser parser = new QueryParser(Version.LUCENE_43, "content", analyzer);//查詢解析器,設定Lucene版本、要查詢的Field、分詞器 Query query = null; try { query = parser.parse(text); } catch (ParseException e) { e.printStackTrace(); } Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query)); TokenStream tokenStream = null; ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; Document doc; for (int i = 0; i < hits.length; i++) { int docId = hits[i].doc; doc = searcher.doc(docId); String content=doc.get("content").trim(); tokenStream = analyzer.tokenStream("content", new StringReader(content)); content = highlighter.getBestFragment(tokenStream, content); System.out.print((i+1)+": 檔名稱"+doc.get("filename")); System.out.println("---------檔案內容:"+content); } ireader.close(); }catch (Exception e){ } } /** * 建立索引 */ public void buildTxtIndex(){ try { directory= FSDirectory.open(new 
File(indexPath)); indexWriter=getIndexWriter(directory); indexWriter.deleteAll();//清空所有索引庫 } catch(Exception e) { System.out.println("索引開啟異常!"); } List<File> fileList = getFileList(dataPath); Document document = null; try{ for(File file:fileList){ document = fileToDocument(file); indexWriter.addDocument(document); System.out.println("filename:"+document.get("filename")+";content:"+document.get("content")); } indexWriter.commit(); closeWriter(); }catch (Exception e){ } } /** * 獲得indexwriter物件 * @param dir * @return * @throws Exception */ public IndexWriter getIndexWriter(Directory dir) throws Exception { IndexWriterConfig iwc=new IndexWriterConfig(Version.LUCENE_43, analyzer); return new IndexWriter(dir, iwc); } /** * 關閉indexwriter物件 * @throws Exception */ public void closeWriter() throws Exception { if(indexWriter != null) { indexWriter.close(); } } /** * 將檔案轉換成Document物件 * @param file * @return * @throws Exception */ public Document fileToDocument(File file) throws Exception { Document document=new Document(); document.add(new Field("filename", file.getName(), TextField.TYPE_STORED)); document.add(new Field("content", getFileContent(file), TextField.TYPE_STORED)); document.add(new Field("size", file.getTotalSpace()+"", TextField.TYPE_STORED)); return document; } /** * 讀取檔案內容 * @param file * @return * @throws Exception */ public String getFileContent(File file) throws Exception{ InputStreamReader reader = new InputStreamReader(new FileInputStream(file),"GBK"); BufferedReader br = new BufferedReader(reader); StringBuilder result = new StringBuilder(); String lineTxt = null; while((lineTxt = br.readLine()) != null){ result.append(lineTxt); } br.close(); reader.close(); return result.toString(); } /** * 獲得所有txt檔案 * @param dirPath * @return */ public List<File> getFileList(String dirPath) { File[] files=new File(dirPath).listFiles(); List<File> fileList=new ArrayList<File>(); for(File file: files) { if(isTxtFile(file.getName())) { fileList.add(file); } } return 
fileList; } /** * 判斷是否是txt檔案 * @param fileName * @return */ public boolean isTxtFile(String fileName) { if(fileName.lastIndexOf(".txt") > 0) { return true; } return false; } }