设为首页 收藏本站
查看: 579|回复: 0

[经验分享] Apache SOLR and Carrot2 integration strategies 2

[复制链接]

尚未签到

发表于 2016-12-16 09:44:52 | 显示全部楼层 |阅读模式
  In order to use a custom Chinese tokenizer (e.g. jcseg), follow these steps:
  1. download the carrot2 source code and import it into Eclipse
  #git clone git://github.com/carrot2/carrot2.git
  #cd carrot2
  #ant -p
  #ant eclipse
  2. import jcseg into Eclipse and add it as a dependency of the carrot2-util-text subproject.
  
DSC0000.png
 
  3.  modify org.carrot2.text.linguistic.DefaultTokenizerFactory.java

// Registers the default per-language tokenizer factories used by Carrot2's
// preprocessing pipeline. Every language starts out mapped to the plain
// whitespace tokenizer; Chinese (Simplified) is then overridden with the
// jcseg-backed adapter created in step 4 below.
private static EnumMap<LanguageCode, IFactory<ITokenizer>> createDefaultTokenizers() {
EnumMap<LanguageCode, IFactory<ITokenizer>> map = Maps
.newEnumMap(LanguageCode.class);
// By default, we use our own tokenizer for all languages.
IFactory<ITokenizer> whitespaceTokenizerFactory = new NewClassInstanceFactory<ITokenizer>(
ExtendedWhitespaceTokenizer.class);
IFactory<ITokenizer> chineseTokenizerFactory = new NewClassInstanceFactory<ITokenizer>(
InokChineseTokenizerAdapter.class);
for (LanguageCode lc : LanguageCode.values()) {
map.put(lc, whitespaceTokenizerFactory);
}
// Chinese and Thai are exceptions, we use adapters around tokenizers
// from Lucene.
map.put(LanguageCode.CHINESE_SIMPLIFIED, chineseTokenizerFactory);
// NOTE: the "....." below is the article's elision — the real method
// continues here (e.g. the Thai mapping and the final `return map;`).
.....
}
  4. create new class org.carrot2.text.linguistic.lucene.InokChineseTokenizerAdapter.java

package org.carrot2.text.linguistic.lucene;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;
import org.lionsoul.jcseg.analyzer.JcsegFilter;
import org.lionsoul.jcseg.analyzer.JcsegTokenizer;
import org.lionsoul.jcseg.core.ADictionary;
import org.lionsoul.jcseg.core.DictionaryFactory;
import org.lionsoul.jcseg.core.ISegment;
import org.lionsoul.jcseg.core.IWord;
import org.lionsoul.jcseg.core.JcsegException;
import org.lionsoul.jcseg.core.JcsegTaskConfig;
import org.lionsoul.jcseg.core.SegmentFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Adapts the jcseg Chinese segmenter to Carrot2's {@link ITokenizer}
 * contract while also extending Lucene's {@link Tokenizer}, so Carrot2
 * clustering can tokenize Simplified Chinese text via jcseg.
 *
 * NOTE(review): this targets the Lucene 3.x Tokenizer API
 * (Reader-taking constructor, reset(Reader)) — confirm against the Lucene
 * version bundled with this carrot2 checkout.
 */
public class InokChineseTokenizerAdapter extends Tokenizer implements
ITokenizer {
private final static Logger logger = LoggerFactory
.getLogger(InokChineseTokenizerAdapter.class);
// jcseg segmenter instance that produces IWord tokens from the input Reader.
private ISegment segmentor;
private OffsetAttribute offsetAtt;
private CharTermAttribute termAtt = null;
// Scratch buffer wrapped around the current term; see note in nextToken().
private final MutableCharArray tempCharSequence;
/**
 * Builds a jcseg segmenter with the default task config and dictionary.
 *
 * @throws JcsegException if jcseg cannot be constructed
 * @throws IOException    if the dictionary/segmenter fails to initialize
 */
public InokChineseTokenizerAdapter() throws JcsegException, IOException {
// Lucene 3.x Tokenizer requires a Reader at construction time; an empty
// one is used until reset(Reader) supplies real input.
super(new StringReader(""));
JcsegTaskConfig config = new JcsegTaskConfig();
ADictionary dic = DictionaryFactory.createDefaultDictionary(config);
this.tempCharSequence = new MutableCharArray(new char[0]);
// First argument 1 selects the jcseg segmentation mode — presumably
// "complex" mode; verify against the jcseg SegmentFactory docs.
segmentor = SegmentFactory.createJcseg(1, new Object[] { config, dic });
segmentor.reset(input);
termAtt = addAttribute(CharTermAttribute.class);
offsetAtt = addAttribute(OffsetAttribute.class);
}
// Re-points both the Lucene stream and the jcseg segmenter at new input.
@Override
public void reset(Reader reader) throws IOException {
super.reset();
segmentor.reset(reader);
}
/**
 * Carrot2-side token pump: advances the underlying Lucene stream and maps
 * the current token to a Carrot2 token-type constant.
 *
 * @return TT_TERM or TT_PUNCTUATION for a token, TT_EOF at end of input
 */
@Override
public short nextToken() throws IOException {
final boolean hasNextToken = incrementToken();
if (hasNextToken) {
short flags = 0;
final char[] image = termAtt.buffer();
final int length = termAtt.length();
// tempCharSequence is refreshed here but its contents are not read in
// this method — presumably leftover from the stock Lucene adapter it
// was copied from; Carrot2 pulls the term via setTermBuffer() instead.
tempCharSequence.reset(image, 0, length);
if (length == 1) {
// NOTE(review): every single-character token is flagged as
// punctuation — this also tags valid one-character Chinese words.
// Confirm this heuristic is intentional.
flags = ITokenizer.TT_PUNCTUATION;
} else {
flags = ITokenizer.TT_TERM;
}
return flags;
}
return ITokenizer.TT_EOF;
}
// Copies the current term into the caller-supplied Carrot2 buffer.
@Override
public void setTermBuffer(MutableCharArray array) {
// TODO Auto-generated method stub
array.reset(termAtt.buffer(), 0, termAtt.length());
}
/**
 * Lucene-side token pump: pulls the next IWord from jcseg and publishes
 * its text and offsets through the token attributes.
 *
 * @return true if a token was produced, false at end of stream
 */
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
IWord word = segmentor.next();
if (word != null) {
termAtt.append(word.getValue());
// NOTE(review): assumes word.getLength() equals the length of the
// string appended above — confirm against the jcseg IWord contract.
termAtt.setLength(word.getLength());
offsetAtt.setOffset(word.getPosition(),
word.getPosition() + word.getLength());
return true;
} else {
end();
return false;
}
}
}

  5. recompile and build jars in carrot2
  #cd carrot2
  a. modify build.xml  to add jcseg jars

<!-- Test classpath. The added lib/jcseg-*.jar include is technically
     redundant with lib/**/*.jar above, but makes the new jcseg
     dependency explicit. -->
<patternset id="lib.test">
<include name="core/**/*.jar" />
<include name="lib/**/*.jar" />
<include name="lib/jcseg-*.jar" />
<exclude name="lib/org.slf4j/slf4j-nop*" />
<include name="applications/carrot2-dcs/**/*.jar" />
<include name="applications/carrot2-webapp/lib/*.jar" />
<include name="applications/carrot2-benchmarks/lib/*.jar" />
</patternset>


<!-- Core classpath: adds the jcseg jars alongside the stock carrot2
     libraries (again redundant with lib/**/*.jar but explicit). -->
  <patternset id="lib.core">
<include name="lib/**/*.jar" />
<include name="core/carrot2-util-matrix/lib/*.jar" />
<include name="lib/jcseg-*.jar" />
<patternset refid="lib.core.excludes" />
</patternset>


<!-- Minimal core classpath: this set has no catch-all lib/**/*.jar entry,
     so the lib/jcseg-*.jar include here is REQUIRED for jcseg to be
     bundled into the mini build. -->
  <patternset id="lib.core.mini">
<include name="lib/**/mahout-*.jar" />
<include name="lib/jcseg-*.jar" />
<include name="lib/**/mahout.LICENSE" />
<include name="lib/**/colt.LICENSE" />
<include name="lib/**/commons-lang*" />
<include name="lib/**/guava*" />
<include name="lib/**/jackson*" />
<include name="lib/**/lucene-snowball*" />
<include name="lib/**/lucene.LICENSE" />
<include name="lib/**/hppc-*.jar" />
<include name="lib/**/hppc*.LICENSE" />
<include name="lib/**/slf4j-api*.jar" />
<include name="lib/**/slf4j-nop*.jar" />
<include name="lib/**/slf4j.LICENSE" />
<include name="lib/**/attributes-binder-*.jar" />
</patternset>
  Note:     lib/jcseg-*.jar
  b. cp jcseg-analyzer-1.9.5.jar  and jcseg-core-1.9.5.jar to carrot2/lib/
  c. recompile and build the jars
  #ant jar
  d. cp tmp/jar/carrot2-core-3.10.0-SNAPSHOT.jar to solr/WEB-INF/lib/
  Note: you should copy the jars in contrib/clustering/lib/, the jcseg jars, the lexicon dir and the jcseg.properties file to solr/WEB-INF/lib/.
  Warning: the most important configuration in solrconfig.xml is to define the tokenizerFactory attribute:
  <str name="PreprocessingPipeline.tokenizerFactory">org.carrot2.text.linguistic.DefaultTokenizerFactory</str>

<!-- Solr clustering component wired to Carrot2's Lingo algorithm.
     MultilingualClustering.defaultLanguage selects Simplified Chinese, and
     PreprocessingPipeline.tokenizerFactory points at the patched
     DefaultTokenizerFactory so the jcseg adapter is used for tokenization. -->
  <searchComponent name="clustering"
enable="true"
class="solr.clustering.ClusteringComponent" >
<lst name="engine">
<str name="name">lingo</str>
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
<str name="carrot.resourcesDir">clustering/carrot2</str>
<str name="MultilingualClustering.defaultLanguage">CHINESE_SIMPLIFIED</str>
<str name="PreprocessingPipeline.tokenizerFactory">org.carrot2.text.linguistic.DefaultTokenizerFactory</str>
</lst>
</searchComponent>

运维网声明 1、欢迎大家加入本站运维交流群:群②:261659950 群⑤:202807635 群⑦870801961 群⑧679858003
2、本站所有主题由该帖子作者发表,该帖子作者与运维网享有帖子相关版权
3、所有作品的著作权均归原作者享有,请您和我们一样尊重他人的著作权等合法权益。如果您对作品感到满意,请购买正版
4、禁止制作、复制、发布和传播具有反动、淫秽、色情、暴力、凶杀等内容的信息,一经发现立即删除。若您因此触犯法律,一切后果自负,我们对此不承担任何责任
5、所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其内容的准确性、可靠性、正当性、安全性、合法性等负责,亦不承担任何法律责任
6、所有作品仅供您个人学习、研究或欣赏,不得用于商业或者其他用途,否则,一切后果均由您自己承担,我们对此不承担任何法律责任
7、如涉及侵犯版权等问题,请您及时通知我们,我们将立即采取措施予以解决
8、联系人Email:admin@iyunv.com 网址:www.yunweiku.com

所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其承担任何法律责任,如涉及侵犯版权等问题,请您及时通知我们,我们将立即处理,联系人Email:kefu@iyunv.com,QQ:1061981298 本贴地址:https://www.yunweiku.com/thread-315016-1-1.html 上篇帖子: 基于Solr的地理位置搜索(3) 下篇帖子: Solr的扩展(Scaling)以及性能调优
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

扫码加入运维网微信交流群X

扫码加入运维网微信交流群

扫描二维码加入运维网微信交流群,最新一手资源尽在官方微信交流群!快快加入我们吧...

扫描微信二维码查看详情

客服E-mail:kefu@iyunv.com 客服QQ:1061981298


QQ群⑦:运维网交流群⑦ QQ群⑧:运维网交流群⑧ k8s群:运维网kubernetes交流群


提醒:禁止发布任何违反国家法律、法规的言论与图片等内容;本站内容均来自个人观点与网络等信息,非本站认同之观点.


本站大部分资源是网友从网上搜集分享而来,其版权均归原作者及其网站所有,我们尊重他人的合法权益,如有内容侵犯您的合法权益,请及时与我们联系进行核实删除!



合作伙伴: 青云cloud

快速回复 返回顶部 返回列表