ElasticSearch學習筆記-同義詞記錄
阿新 • 發佈:2019-02-14
import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.wltea.analyzer.core.IKSegmenter; import org.wltea.analyzer.core.Lexeme; public class IKTokenizer extends Tokenizer { private IKSegmenter _IKImplement = null; private final CharTermAttribute termAtt; private final OffsetAttribute offsetAtt; private final TypeAttribute typeAtt; private int endPosition; public IKTokenizer(boolean useSmart) { offsetAtt = addAttribute(OffsetAttribute.class); termAtt = addAttribute(CharTermAttribute.class); typeAtt = addAttribute(TypeAttribute.class); _IKImplement = new IKSegmenter(input, useSmart); } public IKTokenizer(Reader in, boolean useSmart) { offsetAtt = addAttribute(OffsetAttribute.class); termAtt = addAttribute(CharTermAttribute.class); typeAtt = addAttribute(TypeAttribute.class); _IKImplement = new IKSegmenter(input, useSmart); } @Override public boolean incrementToken() throws IOException { clearAttributes(); Lexeme nextLexeme = _IKImplement.next(); if (nextLexeme != null) { termAtt.append(nextLexeme.getLexemeText()); termAtt.setLength(nextLexeme.getLength()); offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); endPosition = nextLexeme.getEndPosition(); typeAtt.setType(nextLexeme.getLexemeTypeString()); return true; } return false; } @Override public void reset() throws IOException { super.reset(); _IKImplement.reset(input); } @Override public final void end() { int finalOffset = correctOffset(this.endPosition); offsetAtt.setOffset(finalOffset, finalOffset); } }
import java.io.IOException; import java.util.HashMap; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.synonym.SynonymFilterFactory; import org.apache.lucene.analysis.util.ClasspathResourceLoader; import org.apache.lucene.util.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class IKSynonymsAnalyzer extends Analyzer { private Logger LOG = LoggerFactory.getLogger(IKSynonymsAnalyzer.class); private Boolean useSmart = null; public IKSynonymsAnalyzer() { } public IKSynonymsAnalyzer(boolean useSmart) { this.useSmart = useSmart; } @Override protected TokenStreamComponents createComponents(String fieldName) { Map<String, String> filterArgs = new HashMap<String, String>(); filterArgs.put("synonyms", "elastic/synonyms_1.txt,elastic/synonyms_2.txt"); filterArgs.put("luceneMatchVersion", Version.LUCENE_5_5_2.toString()); filterArgs.put("expand", "true"); SynonymFilterFactory factory = new SynonymFilterFactory(filterArgs); try { factory.inform(new ClasspathResourceLoader()); } catch (IOException e) { LOG.error(e.getMessage(), e); } Tokenizer tokenizer = null == useSmart ? new WhitespaceTokenizer() : new IKTokenizer(useSmart); return new TokenStreamComponents(tokenizer, factory.create(tokenizer)); } }
import java.io.StringReader; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.cisiondata.modules.elastic.analyzer.IKSynonymsAnalyzer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.wltea.analyzer.core.IKSegmenter; import org.wltea.analyzer.core.Lexeme; public class ElasticUtils { private static Logger LOG = LoggerFactory.getLogger(ElasticUtils.class); private static Analyzer ikanalyzer = new IKSynonymsAnalyzer(); /** * 分詞 * @param input * @param userSmart true 用智慧分詞 false 細粒度分詞 * @return */ public static String[] analyze(String input, boolean userSmart) { List<String> results = new ArrayList<String>(); try { IKSegmenter ikSeg = new IKSegmenter(new StringReader(input.trim()), userSmart); for (Lexeme lexeme = ikSeg.next(); lexeme != null; lexeme = ikSeg.next()) { results.add(lexeme.getLexemeText()); } } catch (Exception e) { LOG.error(e.getMessage(), e); } return results.toArray(new String[0]); } public static String[] convertSynonyms(String input) { return convertSynonyms(ikanalyzer, input); } /** * 同義詞匹配,返回TokenStream */ public static String[] convertSynonyms(Analyzer analyzer, String input) { Set<String> results = new HashSet<String>(); TokenStream tokenStream = analyzer.tokenStream("fields", input); CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class); try { tokenStream.reset(); while (tokenStream.incrementToken()) { results.add(termAttribute.toString()); } tokenStream.end(); tokenStream.close(); } catch (Exception e) { LOG.error(e.getMessage(), e); } return results.toArray(new String[0]); } }
// Build an OR query over the search term and all of its configured synonyms.
BoolQueryBuilder boolQueryBuilder = new BoolQueryBuilder();
// Expand the raw value into its terms plus synonyms via the shared analyzer.
String[] keywords = ElasticUtils.convertSynonyms(valueString);
for (int i = 0, len = keywords.length; i < len; i++) {
// should(): a document matching ANY synonym phrase satisfies the query.
boolQueryBuilder.should(QueryBuilders.matchPhraseQuery(name, keywords[i]));
}