不启动Solr,使用Solr的analyzer chain (使用mmseg4j分词)
/**
 * Runs a Solr-style analysis chain (MMSeg tokenizer -> synonym filter -> stop filter)
 * directly through the Lucene factories, without starting a Solr instance.
 *
 * Dictionary, synonym and stop-word files are resolved through a
 * ClasspathResourceLoader using the relative paths given below.
 */
class MyAnalyzer {

    /** Analyzer that assembles the tokenizer and the filter chain on demand. */
    def analyzer = new Analyzer() {
        /**
         * Builds the analysis chain for a field.
         *
         * @param s field name (not used when building the chain)
         * @return components wrapping the tokenizer and the last stage of the chain
         */
        @Override
        protected TokenStreamComponents createComponents(String s) {
            def loader = new ClasspathResourceLoader()

            // Tokenizer: mmseg4j in "complex" mode, dictionaries under "dict".
            def tokenizerFactory = new MMSegTokenizerFactory(["mode": "complex", "dicPath": "dict"])
            tokenizerFactory.inform(loader)
            Tokenizer tokenizer = tokenizerFactory.create()

            // Synonym expansion. The result is intentionally left untyped: the factory
            // may hand back its input unchanged (e.g. when no synonyms are loaded), and
            // a Tokenizer cannot be cast to TokenFilter.
            def synonymFactory = new SynonymFilterFactory(["synonyms": "dict/synonyms.txt", "expand": "true", "ignoreCase": "true"])
            synonymFactory.inform(loader)
            def chain = synonymFactory.create(tokenizer)

            // Stop-word removal on top of the synonym stage.
            def stopFactory = new StopFilterFactory(["ignoreCase": "true", "words": "dict/stop_words_cn.txt"])
            stopFactory.inform(loader)
            chain = stopFactory.create(chain)

            return new TokenStreamComponents(tokenizer, chain)
        }
    }

    /**
     * Splits the given text into terms using the analysis chain.
     *
     * @param text text to analyze; {@code null} yields an empty list
     * @return list of term strings in the order they were emitted
     */
    def tokenize(String text) {
        def tokens = []
        if (text == null) {
            return tokens
        }
        def ts = analyzer.tokenStream("text", text)
        try {
            def termAttr = ts.addAttribute(CharTermAttribute.class)
            ts.reset()
            while (ts.incrementToken()) {
                tokens.add(termAttr.toString())
            }
            ts.end()
        } finally {
            // Always close: the analyzer reuses its components, so a stream left open
            // after a failure would make the next tokenStream() call fail as well.
            ts.close()
        }
        return tokens
    }

    /** Small demo: tokenizes a sample sentence and prints the resulting terms. */
    public static void main(String[] args) {
        MyAnalyzer demo = new MyAnalyzer()
        try {
            println(demo.tokenize("我是一个粉刷匠"))
        } finally {
            // Release the resources held by the underlying Lucene analyzer.
            demo.analyzer.close()
        }
    }
}
页:
[1]