To use a custom Chinese tokenizer (e.g. jcseg) with Carrot2, follow these steps:
1. Download the Carrot2 source code and import it into Eclipse:
#git clone git://github.com/carrot2/carrot2.git
#cd carrot2
#ant -p
#ant eclipse
2. Import jcseg into Eclipse and add it to the build path of the carrot2-util-text subproject.
3. Modify createDefaultTokenizers() in org.carrot2.text.linguistic.DefaultTokenizerFactory (part of carrot2-util-text) so that Chinese is handled by the jcseg-based adapter:
private static EnumMap<LanguageCode, IFactory<ITokenizer>> createDefaultTokenizers() {
    EnumMap<LanguageCode, IFactory<ITokenizer>> map = Maps
        .newEnumMap(LanguageCode.class);

    // By default, we use our own tokenizer for all languages.
    IFactory<ITokenizer> whitespaceTokenizerFactory = new NewClassInstanceFactory<ITokenizer>(
        ExtendedWhitespaceTokenizer.class);
    IFactory<ITokenizer> chineseTokenizerFactory = new NewClassInstanceFactory<ITokenizer>(
        InokChineseTokenizerAdapter.class);

    for (LanguageCode lc : LanguageCode.values()) {
        map.put(lc, whitespaceTokenizerFactory);
    }

    // Chinese and Thai are exceptions, we use adapters around tokenizers
    // from Lucene.
    map.put(LanguageCode.CHINESE_SIMPLIFIED, chineseTokenizerFactory);
    // ...
}
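The Chinese factory registered above is only picked up for documents tagged as CHINESE_SIMPLIFIED (or when the clustering's default language is set to Chinese). The following is a minimal sketch of such a call, assuming the stock Carrot2 3.x core API (ControllerFactory.createSimple(), Document#setLanguage(), the process(documents, query, algorithm) convenience method); the class name and sample text are made up:
import java.util.ArrayList;
import java.util.List;

import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.core.Cluster;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingResult;

public class ChineseClusteringSketch {
    public static void main(String[] args) {
        List<Document> documents = new ArrayList<Document>();

        // Hypothetical sample document; tagging it as CHINESE_SIMPLIFIED is what
        // routes its text through the jcseg-backed tokenizer registered above.
        Document doc = new Document("中文聚类测试", "使用jcseg分词器对中文文档进行聚类");
        doc.setLanguage(LanguageCode.CHINESE_SIMPLIFIED);
        documents.add(doc);

        Controller controller = ControllerFactory.createSimple();
        ProcessingResult result = controller.process(documents, "聚类",
            LingoClusteringAlgorithm.class);

        // Print the cluster labels produced by Lingo.
        for (Cluster cluster : result.getClusters()) {
            System.out.println(cluster.getLabel());
        }
    }
}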
4. Create a new class, org.carrot2.text.linguistic.lucene.InokChineseTokenizerAdapter:
package org.carrot2.text.linguistic.lucene;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;

import org.lionsoul.jcseg.core.ADictionary;
import org.lionsoul.jcseg.core.DictionaryFactory;
import org.lionsoul.jcseg.core.ISegment;
import org.lionsoul.jcseg.core.IWord;
import org.lionsoul.jcseg.core.JcsegException;
import org.lionsoul.jcseg.core.JcsegTaskConfig;
import org.lionsoul.jcseg.core.SegmentFactory;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Adapts the jcseg segmenter to Carrot2's ITokenizer contract so that Chinese
 * text is split into words rather than whitespace-delimited chunks.
 */
public class InokChineseTokenizerAdapter extends Tokenizer implements ITokenizer {

    private final static Logger logger = LoggerFactory
        .getLogger(InokChineseTokenizerAdapter.class);

    /** The jcseg segmenter doing the actual word splitting. */
    private ISegment segmentor;

    private OffsetAttribute offsetAtt;
    private CharTermAttribute termAtt;

    private final MutableCharArray tempCharSequence;

    public InokChineseTokenizerAdapter() throws JcsegException, IOException {
        super(new StringReader(""));

        // Create a jcseg segmenter with the default configuration and dictionary.
        JcsegTaskConfig config = new JcsegTaskConfig();
        ADictionary dic = DictionaryFactory.createDefaultDictionary(config);
        this.tempCharSequence = new MutableCharArray(new char[0]);
        segmentor = SegmentFactory.createJcseg(1, new Object[] { config, dic });
        segmentor.reset(input);

        termAtt = addAttribute(CharTermAttribute.class);
        offsetAtt = addAttribute(OffsetAttribute.class);
    }

    @Override
    public void reset(Reader reader) throws IOException {
        super.reset();
        // Point the segmenter at the new input.
        segmentor.reset(reader);
    }

    @Override
    public short nextToken() throws IOException {
        final boolean hasNextToken = incrementToken();
        if (hasNextToken) {
            short flags = 0;
            final char[] image = termAtt.buffer();
            final int length = termAtt.length();
            tempCharSequence.reset(image, 0, length);

            // Treat single-character tokens as punctuation, everything else as a term.
            if (length == 1) {
                flags = ITokenizer.TT_PUNCTUATION;
            } else {
                flags = ITokenizer.TT_TERM;
            }
            return flags;
        }
        return ITokenizer.TT_EOF;
    }

    @Override
    public void setTermBuffer(MutableCharArray array) {
        array.reset(termAtt.buffer(), 0, termAtt.length());
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();

        // Pull the next word from jcseg and expose it through Lucene attributes.
        IWord word = segmentor.next();
        if (word != null) {
            termAtt.append(word.getValue());
            termAtt.setLength(word.getLength());
            offsetAtt.setOffset(word.getPosition(),
                word.getPosition() + word.getLength());
            return true;
        } else {
            end();
            return false;
        }
    }
}
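Before rebuilding Carrot2, the adapter can be smoke-tested on its own. This is only a sketch: the class name and sample sentence are made up, and it assumes jcseg can locate its default dictionary (e.g. via jcseg.properties) on the classpath:
import java.io.StringReader;

import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.lucene.InokChineseTokenizerAdapter;
import org.carrot2.text.util.MutableCharArray;

public class InokTokenizerSmokeTest {
    public static void main(String[] args) throws Exception {
        // The no-arg constructor loads jcseg's default configuration and dictionary.
        InokChineseTokenizerAdapter tokenizer = new InokChineseTokenizerAdapter();
        tokenizer.reset(new StringReader("研究生命起源"));

        // Iterate over the segmented tokens using the Carrot2 ITokenizer contract.
        MutableCharArray term = new MutableCharArray(new char[0]);
        short tokenType;
        while ((tokenType = tokenizer.nextToken()) != ITokenizer.TT_EOF) {
            tokenizer.setTermBuffer(term);
            System.out.println(term + "  (token type: " + tokenType + ")");
        }
    }
}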
5. Recompile Carrot2 and rebuild its jars:
#cd carrot2
a. Modify build.xml to add the jcseg jars.