lucene構建restful風格的簡單搜索引擎服務
來自於本人博客:
lucene構建restful風格的簡單搜索引擎服務
本人的博客如今也要改成使用lucene進行全文檢索的功能,因此在這裏把代碼貼出來與大家分享
一,文件夾結構:
二,配置文件:
總共有四個配置文件:bonecp-config.xml,IKAnalyzer.cfg.xml,log4j.properties,system-config.xml
1.bonecp-config.xml是配置jdbc連接池用的,不用這個配置也行,bonecp包有默認配置
2.IKAnalyzer.cfg.xml是IKAnalyzer分詞要用的字典配置文件
這裏也能夠不用配置 <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd"> <properties> <comment>IK Analyzer 擴展配置</comment> <!--用戶能夠在這裏配置自己的擴展字典 --> <entry key="ext_dict">/data/lucene/dict/1_dict.txt;/data/lucene/dict/2_dict.txt;/data/lucene/dict/3_dict.txt;/data/lucene/dict/4_dict.txt;/data/lucene/dict/5_dict.txt;/data/lucene/dict/6_dict.txt;/data/lucene/dict/7_dict.txt;/data/lucene/dict/8_dict.txt;</entry> <!--用戶能夠在這裏配置自己的擴展停止詞字典 <entry key="ext_stopwords">/data/lucene/dict/stopword.dic</entry> --> </properties>
3.log4j.properties這個不用多說了
4.system-config.xml是一些系統的配置參數
<?xml version="1.0" encoding="UTF-8"?> <configs> <mysql> <port>3306</port> <user>test</user> <password>test</password> <partitionCount>6</partitionCount> <maxWait>3600</maxWait> <driverClass>com.mysql.jdbc.Driver</driverClass> <idleMaxAge>1800</idleMaxAge> <idleConnectionTestPeriod>300</idleConnectionTestPeriod> <host>jdbc:mysql://localhost/blog?characterEncoding=UTF-8</host> </mysql> <search> <!--這裏的路徑能夠自己修改--> <indexPath>/data/lucene/index</indexPath> <recommendNetIndexPath>/data/lucene/index/recommendNet</recommendNetIndexPath> <searcNum>10</searcNum> <resultNum>10000</resultNum> </search> </configs>
三,監聽器SystemStartupListener,實現了ServletContextListener
package com.blog.listener;

import java.io.File;
import java.net.URL;
import java.sql.SQLException;
import java.util.List;

import javax.servlet.ServletContextEvent;
import javax.servlet.ServletContextListener;

import org.apache.log4j.Logger;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import com.blog.db.DBFactory;
import com.blog.search.BlogSearch;
import com.blog.search.index.BlogIndex;

/**
 * Servlet context listener that bootstraps the application from
 * {@code system-config.xml}: it initialises the database pool, publishes the
 * search settings as system properties, creates the index if needed and opens
 * the shared index reader.
 */
public class SystemStartupListener implements ServletContextListener {

    private static Logger log = Logger.getLogger(SystemStartupListener.class);

    /** Classpath name of the system configuration file. */
    private static final String CONFIG_RESOURCE = "system-config.xml";

    /** Shuts the database factory down when the web application stops. */
    public void contextDestroyed(ServletContextEvent arg0) {
        DBFactory.shutDown();
    }

    /**
     * Reads the configuration and initialises the database and search layers.
     * Failures are logged and not rethrown, so the web application still starts.
     */
    public void contextInitialized(ServletContextEvent arg0) {
        try {
            URL url = this.getClass().getClassLoader().getResource(CONFIG_RESOURCE);
            if (url == null) {
                throw new RuntimeException(CONFIG_RESOURCE + " not found on classpath");
            }
            // Read straight from the URL: url.getFile() is percent-encoded, so
            // new File(url.getFile()) fails when the deployment path contains
            // spaces or non-ASCII characters.
            Document doc = new SAXReader().read(url);
            Element rootEle = doc.getRootElement();

            initDatabase(rootEle.element("mysql"));
            initSearch(rootEle.element("search"));

            log.info("初始化搜索.....");
            BlogSearch.init();
        } catch (DocumentException e) {
            log.error("解析配置文件出錯.....", e);
        } catch (Exception e) {
            log.error("出現未知錯誤....", e);
        }
    }

    /** Initialises the connection pool from the first {@code <mysql>} element. */
    private void initDatabase(Element mysqlEle) throws Exception {
        if (mysqlEle == null) {
            throw new RuntimeException("初始化失敗....");
        }
        String host = mysqlEle.elementText("host");
        String user = mysqlEle.elementText("user");
        String password = mysqlEle.elementText("password");
        String driverClass = mysqlEle.elementText("driverClass");
        Integer partitionCount = intOf(mysqlEle, "partitionCount");
        Integer maxWait = intOf(mysqlEle, "maxWait");
        Integer idleMaxAge = intOf(mysqlEle, "idleMaxAge");
        Integer idleConnectionTestPeriod = intOf(mysqlEle, "idleConnectionTestPeriod");
        DBFactory.init(driverClass, host, user, password, partitionCount, maxWait,
                idleMaxAge, idleConnectionTestPeriod);
    }

    /**
     * Publishes the {@code <search>} settings as system properties and builds
     * the index. The properties are set before any BlogIndex / BlogSearch call
     * because those classes read them into static fields.
     */
    private void initSearch(Element searchEle) {
        if (searchEle == null) {
            throw new RuntimeException("初始化失敗....");
        }
        // location of the index files
        String indexPath = searchEle.elementText("indexPath");
        // number of results returned per search page
        String searcNum = searchEle.elementText("searcNum");
        String resultNum = searchEle.elementText("resultNum");
        String recommendNetIndexPath = searchEle.elementText("recommendNetIndexPath");

        System.setProperty("searcNum", searcNum);
        System.setProperty("resultNum", resultNum);
        System.setProperty("indexFilePath", indexPath);
        System.setProperty("recommendNetIndexPath", recommendNetIndexPath);

        BlogIndex.buildIndex(recommendNetIndexPath);
    }

    /** Parses the trimmed text of child element {@code name} as an integer. */
    private static Integer intOf(Element parent, String name) {
        return Integer.valueOf(parent.elementTextTrim(name));
    }
}
四,util包中的Constant常量類
package com.blog.util; public class Constant { public static final Integer searcNum = Integer.parseInt(System.getProperty("searcNum")); public static final Integer resultNum = Integer.parseInt(System.getProperty("resultNum")); }
util包中的DataToJson類:
package com.blog.util;

import java.util.List;

import com.google.gson.JsonArray;
import com.google.gson.JsonObject;

/**
 * Serialises search results to the JSON returned by the REST API.
 */
public class DataToJson {

    /**
     * Builds a JSON object with a {@code totalCount} property and a
     * {@code data} array holding one {@code {"id": ...}} object per id.
     *
     * @param ids        ids of the matched documents, in result order
     * @param totalCount total number of hits
     * @return the JSON text
     */
    public static String parseDataToJson(List<Long> ids, int totalCount) {
        JsonObject root = new JsonObject();
        root.addProperty("totalCount", totalCount);

        JsonArray data = new JsonArray();
        for (Long docId : ids) {
            JsonObject entry = new JsonObject();
            entry.addProperty("id", docId);
            data.add(entry);
        }

        root.add("data", data);
        return root.toString();
    }
}
五,entity包中的實體類:
Dashboard:
package com.blog.search.entity; public class Dashboard { private Long id; private String content; private String title; public Long getId() { return id; } public void setId(Long id) { this.id = id; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } }
六,lucene相關的索引和檢索類:
index包中的BlogIndex:
package com.blog.search.index;

import java.io.File;
import java.io.IOException;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.blog.search.entity.Dashboard;

/**
 * Maintains the Lucene index: creating an empty index, and adding, updating
 * and deleting the documents built from {@link Dashboard} entries.
 */
public class BlogIndex {

    /** Index directory, read from system property {@code indexFilePath} at class load. */
    private static final String indexFilePath = System.getProperty("indexFilePath");

    private static Logger log = Logger.getLogger(BlogIndex.class);

    public BlogIndex() {
    }

    /**
     * Creates an empty index at {@code path}. Call this during initialisation
     * when no index exists yet. Nothing happens unless {@code path} is an
     * existing, empty directory.
     *
     * @param path directory in which to create the index
     */
    public static void buildIndex(String path) {
        File file = new File(path);
        // listFiles() returns null when the path is not a directory or cannot be read
        File[] children = file.listFiles();
        if (!file.isDirectory() || children == null || children.length != 0) {
            return;
        }
        IndexWriter writer = null;
        try {
            writer = openWriter(path, OpenMode.CREATE);
            writer.deleteAll();
        } catch (IOException e) {
            log.error("failed to create empty index at " + path, e);
        } finally {
            closeQuietly(writer);
        }
    }

    /**
     * Converts an entry into a Lucene document: analysed {@code title}
     * (stored), analysed {@code content} (not stored) and an un-analysed,
     * stored {@code id}.
     */
    @SuppressWarnings("deprecation")
    private Document getDocument(Dashboard dashboard) throws Exception {
        Document doc = new Document();
        doc.add(new Field("title", dashboard.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("content", dashboard.getContent(), Field.Store.NO, Field.Index.ANALYZED));
        Field idField = new StringField("id", dashboard.getId().toString(), Field.Store.YES);
        doc.add(idField);
        return doc;
    }

    /**
     * Adds the entry to the index and commits.
     *
     * @throws Exception if the index cannot be opened or written; the writer
     *                   is closed in every case so the write lock is released
     */
    public void writeToIndex(Dashboard dashboard) throws Exception {
        Document doc = getDocument(dashboard);
        // A failure to open the writer now propagates to the caller instead of
        // being swallowed and followed by a NullPointerException.
        IndexWriter writer = openWriter(indexFilePath, OpenMode.CREATE_OR_APPEND);
        try {
            writer.addDocument(doc);
            writer.commit();
        } finally {
            writer.close();
        }
    }

    /**
     * Deletes every document whose {@code id} term equals {@code id}.
     * Failures are logged and not rethrown.
     */
    public void deleteIndex(Long id) {
        IndexWriter writer = null;
        try {
            writer = openWriter(indexFilePath, OpenMode.CREATE_OR_APPEND);
            writer.deleteDocuments(new Term("id", id.toString()));
            writer.commit();
        } catch (Exception e) {
            log.error("刪除索引出錯.....", e);
        } finally {
            closeQuietly(writer);
        }
    }

    /**
     * Replaces the document(s) with the same {@code id} term by a fresh
     * document built from {@code dashboard}, then commits.
     *
     * @throws Exception if the index cannot be opened or written; the writer
     *                   is closed in every case so the write lock is released
     */
    public void updateIndex(Dashboard dashboard) throws Exception {
        Document doc = getDocument(dashboard);
        IndexWriter writer = openWriter(indexFilePath, OpenMode.CREATE_OR_APPEND);
        try {
            writer.updateDocument(new Term("id", dashboard.getId().toString()), doc);
            writer.commit();
        } finally {
            writer.close();
        }
    }

    /** Opens an IndexWriter on {@code path} using the IK analyzer and the given open mode. */
    private static IndexWriter openWriter(String path, OpenMode mode) throws IOException {
        Directory dir = FSDirectory.open(new File(path));
        Analyzer analyzer = new IKAnalyzer(true);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);
        iwc.setOpenMode(mode);
        return new IndexWriter(dir, iwc);
    }

    /** Closes the writer if it was opened, logging rather than propagating a failure. */
    private static void closeQuietly(IndexWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException e) {
            log.error("failed to close index writer", e);
        }
    }
}
七,search包下的BlogSearch類:
package com.blog.search;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParser.Operator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.blog.util.Constant;
import com.blog.util.DataToJson;

/**
 * Runs keyword queries against the {@code title} and {@code content} fields
 * of the index and returns the matching document ids as JSON.
 */
public class BlogSearch {

    private static Logger log = Logger.getLogger(BlogSearch.class);

    /** Index directory, read from system property {@code indexFilePath} at class load. */
    private static final String indexFilePath = System.getProperty("indexFilePath");

    private static String[] field = {"title", "content"};

    /** Searcher created by the most recent {@link #search(String)} call on this instance. */
    private IndexSearcher searcher;

    // Caches the opened IndexReader so the index files are not reopened on every search.
    private static Map<String, IndexReader> readers = new ConcurrentHashMap<String, IndexReader>();

    private static Object lock = new Object();

    /** Opens the index reader and caches it; a failure is logged, not thrown. */
    public static void init() {
        try {
            IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexFilePath)));
            readers.put("blogsearch", reader);
            log.info(readers.toString());
        } catch (IOException e) {
            log.error("初始化搜索器出錯.......", e);
        }
    }

    /**
     * Parses {@code keyword} (terms are AND-ed) and returns up to
     * {@code Constant.resultNum} hits.
     *
     * @return the hits, or {@code null} if parsing or searching failed
     */
    public TopDocs search(String keyword) {
        try {
            Analyzer analyzer = new IKAnalyzer(true);
            QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_43, field, analyzer);
            parser.setDefaultOperator(Operator.AND);
            // wrap the keyword into a Query object
            Query query = parser.parse(keyword);

            // The lock serialises the "refresh reader, swap, close old reader" step.
            // NOTE(review): the search below and the doc lookups in getResult run
            // outside the lock, so another thread may still close a reader that
            // this searcher is using - confirm whether ref-counting is needed.
            synchronized (lock) {
                IndexReader reader = readers.get("blogsearch");
                IndexReader newReader = DirectoryReader.openIfChanged((DirectoryReader) reader);
                if (newReader == null) {
                    // null means the index has not changed
                    newReader = reader;
                } else {
                    readers.put("blogsearch", newReader);
                    reader.close();
                }
                searcher = new IndexSearcher(newReader);
            }
            return searcher.search(query, Constant.resultNum);
        } catch (Exception e) {
            log.error("搜索關鍵字出錯......", e);
            return null;
        }
    }

    /**
     * Searches for {@code keyword} and returns one page of document ids as JSON.
     *
     * @param keyword  query text
     * @param pageSize 1-based page number (despite the name); values below 1
     *                 are treated as page 1
     * @return JSON with {@code totalCount} and the ids of the requested page;
     *         an empty result with count 0 when the search failed. A page that
     *         starts at or beyond the total hit count yields all fetched hits.
     */
    public String getResult(String keyword, int pageSize) {
        TopDocs td = search(keyword);
        if (td == null) {
            // search() already logged the failure; answer with an empty result
            return DataToJson.parseDataToJson(new ArrayList<Long>(0), 0);
        }

        int totalCount = td.totalHits;
        ScoreDoc[] h = td.scoreDocs;
        List<Long> ids = new ArrayList<Long>(h.length);
        if (h.length == 0) {
            log.debug("no result data");
            return DataToJson.parseDataToJson(ids, totalCount);
        }

        int perPage = Constant.searcNum;
        // long arithmetic avoids int overflow for very large page numbers
        long start = (long) perPage * (Math.max(pageSize, 1) - 1);
        long end = start + perPage;
        if (start >= totalCount) {
            start = 0;
            end = totalCount;
        }
        // totalHits can exceed the number of fetched hits (capped by
        // Constant.resultNum), so never index past scoreDocs
        end = Math.min(end, Math.min(totalCount, h.length));

        for (int i = (int) start; i < end; i++) {
            try {
                Document doc = searcher.doc(h[i].doc);
                ids.add(Long.parseLong(doc.get("id")));
            } catch (Exception e) {
                log.error("start=" + start + ", end=" + end + ", " + h.length, e);
            }
        }
        return DataToJson.parseDataToJson(ids, totalCount);
    }
}
八,service包下的BlogSearchService,這是jersey的入口,由這個類向外界提供api:
package com.blog.search.service; import javax.ws.rs.FormParam; import javax.ws.rs.GET; import javax.ws.rs.POST; import javax.ws.rs.Path; import javax.ws.rs.Produces; import javax.ws.rs.QueryParam; import javax.ws.rs.core.MediaType; import com.blog.search.BlogSearch; import com.blog.search.entity.Dashboard; import com.blog.search.index.BlogIndex; import com.google.gson.JsonObject; @Path("/blogSearch/") public class BlogSearchService { @GET @Path("/queryByKeyword") @Produces(MediaType.APPLICATION_JSON) public String queryIdsByKeyword(@QueryParam("keyword") String keyword, @QueryParam("pageSize") Integer pageSize) { return new BlogSearch().getResult(keyword, pageSize); } @POST @Path("/buildByContent") @Produces(MediaType.APPLICATION_JSON) public String buildIndexByContent(@FormParam("content") String content,@FormParam("title")String title, @FormParam("id") Long id) { BlogIndex bi = new BlogIndex(); Dashboard dashboard = new Dashboard(); dashboard.setContent(content); dashboard.setTitle(title); dashboard.setId(id); JsonObject json = new JsonObject(); try { bi.writeToIndex(dashboard); json.addProperty("result", "200"); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); json.addProperty("result", "500"); } finally { //index(); return json.toString(); } } @POST @Path("/deleteById") @Produces(MediaType.APPLICATION_JSON) public String deleteIndexById(@FormParam("id") Long id) { BlogIndex bi = new BlogIndex(); JsonObject json = new JsonObject(); try { bi.deleteIndex(id); json.addProperty("result", 200); } catch(Exception e) { json.addProperty("result", 500); } finally { return json.toString(); } } @POST @Path("/update") @Produces(MediaType.APPLICATION_JSON) public String updateIndex(@FormParam("id") Long id, @FormParam("content") String content, @FormParam("title") String title) { BlogIndex bi = new BlogIndex(); JsonObject json = new JsonObject(); try { Dashboard dashboard = new Dashboard(); dashboard.setContent(content); 
dashboard.setTitle(title); dashboard.setId(id); bi.updateIndex(dashboard); json.addProperty("result", 200); } catch(Exception e) { json.addProperty("result", 500); } finally { return json.toString(); } } }
九,web.xml的配置:
<?xml version="1.0" encoding="UTF-8"?>
<web-app version="2.5" xmlns="http://java.sun.com/xml/ns/javaee" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd"> <display-name></display-name> <welcome-file-list> <welcome-file>index.jsp</welcome-file> </welcome-file-list> <servlet> <servlet-name>JerseyServlet</servlet-name> <servlet-class> com.sun.jersey.spi.container.servlet.ServletContainer </servlet-class> <init-param> <param-name>com.sun.jersey.config.property.packages</param-name> <!-- 系統啟動時掃描的包的路徑--> <param-value>com.blog.search.service</param-value> </init-param> <load-on-startup>1</load-on-startup> </servlet> <servlet-mapping> <servlet-name>JerseyServlet</servlet-name> <url-pattern>/search/*</url-pattern> </servlet-mapping> <listener> <listener-class>com.blog.listener.SystemStartupListener</listener-class> </listener> </web-app>
十,程序依賴包:
slf4j-nop-1.7.5.jar
好了,完成以上步驟並配置好tomcat之後,如果你是用myeclipse自帶的tomcat發佈的,則訪問http://localhost:port/項目名稱/search/blogSearch/buildByContent?後面就是要傳遞的參數,查詢接口的url也與此類似。
就這樣,我們創建了一個簡單的restful風格的搜索引擎服務,裏面的配置大家依照自己的需求改改就好。
lucene構建restful風格的簡單搜索引擎服務