
ElasticSearch Learning Notes - Synonyms
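These notes record one way to add query-time synonym expansion on top of Elasticsearch, built against Lucene 5.5 (the version referenced in the code below) and the IK Chinese analyzer. Three pieces are involved: a custom Tokenizer that adapts IK's IKSegmenter to the Lucene TokenStream API, an Analyzer that chains it with Lucene's SynonymFilter, and a utility class that expands a keyword into its synonyms before the query is built. First, the Tokenizer: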

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKTokenizer extends Tokenizer {

	// The IK segmenter that does the actual work
	private IKSegmenter _IKImplement;

	private final CharTermAttribute termAtt;
	
	private final OffsetAttribute offsetAtt;
	
	private final TypeAttribute typeAtt;

	// End offset of the last token, reported as the final offset in end()
	private int endPosition;
	
	public IKTokenizer(boolean useSmart) {
		offsetAtt = addAttribute(OffsetAttribute.class);
		termAtt = addAttribute(CharTermAttribute.class);
		typeAtt = addAttribute(TypeAttribute.class);
		// "input" is the Reader field managed by the Tokenizer base class;
		// the actual Reader is attached later via setReader()/reset()
		_IKImplement = new IKSegmenter(input, useSmart);
	}

	public IKTokenizer(Reader in, boolean useSmart) {
		this(useSmart);
		// Lucene 5.x Tokenizer constructors no longer take a Reader,
		// so attach it explicitly instead of ignoring the parameter
		setReader(in);
	}

	@Override
	public boolean incrementToken() throws IOException {
		// Clear attributes left over from the previous token
		clearAttributes();
		Lexeme nextLexeme = _IKImplement.next();
		if (nextLexeme != null) {
			// Copy the lexeme text and metadata into the token attributes
			termAtt.append(nextLexeme.getLexemeText());
			termAtt.setLength(nextLexeme.getLength());
			offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
			// Remember the end offset for end()
			endPosition = nextLexeme.getEndPosition();
			typeAtt.setType(nextLexeme.getLexemeTypeString());
			return true;
		}
		// No more lexemes: the stream is exhausted
		return false;
	}

	@Override
	public void reset() throws IOException {
		super.reset();
		// Re-bind the IK segmenter to the freshly attached Reader
		_IKImplement.reset(input);
	}

	@Override
	public final void end() throws IOException {
		// Per the TokenStream contract, call super.end(), then report the final offset
		super.end();
		int finalOffset = correctOffset(this.endPosition);
		offsetAtt.setOffset(finalOffset, finalOffset);
	}

}
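A minimal sketch of driving this Tokenizer by hand (assumes Lucene 5.x, where the Reader is attached via setReader before reset; exception handling omitted):

Tokenizer tokenizer = new IKTokenizer(true);
tokenizer.setReader(new StringReader("中華人民共和國"));
tokenizer.reset();
CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
while (tokenizer.incrementToken()) {
	System.out.println(term.toString());
}
tokenizer.end();
tokenizer.close();

Next, an Analyzer wires this Tokenizer together with Lucene's SynonymFilterFactory, so every token emitted by IK is checked against the synonym dictionaries: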
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class IKSynonymsAnalyzer extends Analyzer {
	
	private static final Logger LOG = LoggerFactory.getLogger(IKSynonymsAnalyzer.class);
	
	// null: tokenize on whitespace; true/false: IK smart vs. fine-grained segmentation
	private Boolean useSmart = null;
	
	public IKSynonymsAnalyzer() {
		
	}
	
	public IKSynonymsAnalyzer(boolean useSmart) {
		this.useSmart = useSmart;
	}

	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
		// Configure the synonym filter; the dictionary files are loaded
		// from the classpath and use the Solr synonym format
		Map<String, String> filterArgs = new HashMap<String, String>();
		filterArgs.put("synonyms", "elastic/synonyms_1.txt,elastic/synonyms_2.txt");
		filterArgs.put("luceneMatchVersion", Version.LUCENE_5_5_2.toString());
		// expand=true maps every synonym in a group to all the others
		filterArgs.put("expand", "true");
		SynonymFilterFactory factory = new SynonymFilterFactory(filterArgs);
		try {
			factory.inform(new ClasspathResourceLoader());
		} catch (IOException e) {
			LOG.error(e.getMessage(), e);
		}
		// Fall back to whitespace tokenization when no IK mode is specified
		Tokenizer tokenizer = null == useSmart ? new WhitespaceTokenizer() : new IKTokenizer(useSmart);
		return new TokenStreamComponents(tokenizer, factory.create(tokenizer));
	}
	
}
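The two files listed under the synonyms key are plain-text dictionaries in the Solr synonym format. A hypothetical elastic/synonyms_1.txt might look like this (comma-separated groups expand in both directions because expand=true; => defines a one-way mapping):

# elastic/synonyms_1.txt (hypothetical sample, UTF-8)
西紅柿, 番茄
馬鈴薯, 土豆
china => 中國

Finally, a utility class hides the analyzer behind two static helpers: analyze for plain IK segmentation and convertSynonyms for synonym expansion: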
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.cisiondata.modules.elastic.analyzer.IKSynonymsAnalyzer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class ElasticUtils {

	private static Logger LOG = LoggerFactory.getLogger(ElasticUtils.class);
	
	private static Analyzer ikanalyzer = new IKSynonymsAnalyzer();
	
	/**
	 * Tokenize the input with IK.
	 * @param input the text to segment
	 * @param useSmart true for smart segmentation, false for fine-grained segmentation
	 * @return the segmented terms
	 */
	public static String[] analyze(String input, boolean useSmart) {
		List<String> results = new ArrayList<String>();
		try {
			IKSegmenter ikSeg = new IKSegmenter(new StringReader(input.trim()), useSmart);
			for (Lexeme lexeme = ikSeg.next(); lexeme != null; lexeme = ikSeg.next()) {
				results.add(lexeme.getLexemeText());
			}
		} catch (Exception e) {
			LOG.error(e.getMessage(), e);
		}
		return results.toArray(new String[0]);
	}
	
	public static String[] convertSynonyms(String input) {
		return convertSynonyms(ikanalyzer, input);
	}

	/**
	 * Run the input through the synonym-aware analyzer and return
	 * the expanded set of terms (original terms plus their synonyms).
	 */
	public static String[] convertSynonyms(Analyzer analyzer, String input) {
		Set<String> results = new HashSet<String>();
		// try-with-resources guarantees the TokenStream is closed even on failure
		try (TokenStream tokenStream = analyzer.tokenStream("fields", input)) {
			CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				results.add(termAttribute.toString());
			}
			tokenStream.end();
		} catch (Exception e) {
			LOG.error(e.getMessage(), e);
		}
		return results.toArray(new String[0]);
	}
	
}
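A quick usage example (the 番茄/西紅柿 result assumes the sample dictionary shown earlier):

// Expand a keyword into itself plus its synonyms
String[] terms = ElasticUtils.convertSynonyms("番茄");
// With the sample dictionary this yields e.g. [番茄, 西紅柿]

The expanded terms can then be OR-ed into a bool query, one match_phrase clause per synonym. In the snippet below, name is the field being searched and valueString is the user's keyword (context variables from the surrounding code):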
// A document matches if any synonym phrase matches the field
BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery();
String[] keywords = ElasticUtils.convertSynonyms(valueString);
for (int i = 0, len = keywords.length; i < len; i++) {
	boolQueryBuilder.should(QueryBuilders.matchPhraseQuery(name, keywords[i]));
}
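The should clauses give exactly the query-time-synonyms behavior we want: any one matching synonym phrase is enough. To round the snippet off, a sketch of executing the query (assumes an Elasticsearch 2.x TransportClient named client; the index name is a placeholder):

SearchResponse response = client.prepareSearch("my_index")
	.setQuery(boolQueryBuilder)
	.setSize(10)
	.execute().actionGet();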