今天需要将一个以逗号分隔的字段建立到索引库中去，没找到现成的逗号分隔符分词器，于是参考了源码里空格分词器 WhitespaceTokenizerFactory 的写法，照葫芦画瓢写了一个逗号分词器：
package com.besttone.analyzer;
import java.io.Reader;
import java.util.Map;
import org.apache.solr.analysis.BaseTokenizerFactory;
/**
 * Solr tokenizer factory that hands out {@link CommaTokenizer} instances.
 *
 * <p>Modelled on the stock whitespace tokenizer factory: it only wires the
 * configured Lucene match version and the input reader into the tokenizer.
 */
public class CommaTokenizerFactory extends BaseTokenizerFactory {

    /**
     * Initializes the factory from its configuration arguments.
     *
     * <p>Delegates to the base class first and then calls
     * {@code assureMatchVersion()} (presumably this checks that a Lucene
     * match version has been configured; confirm against the Solr base class).
     *
     * @param args the configuration arguments passed to this factory
     */
    @Override
    public void init(Map<String, String> args) {
        super.init(args);
        assureMatchVersion();
    }

    /**
     * Creates a tokenizer that reads from the given input.
     *
     * @param input the character stream to tokenize
     * @return a new {@link CommaTokenizer} built with this factory's
     *         {@code luceneMatchVersion}
     */
    public CommaTokenizer create(Reader input) {
        final CommaTokenizer tokenizer = new CommaTokenizer(luceneMatchVersion, input);
        return tokenizer;
    }
}
package com.besttone.analyzer;
import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/**
 * A {@link CharTokenizer} that treats the comma as its only delimiter.
 *
 * <p>Every code point other than {@code ','} is reported as a token
 * character, so whitespace is NOT a delimiter here and nothing in this class
 * strips it: spaces surrounding a comma stay part of the adjacent token.
 */
public class CommaTokenizer extends CharTokenizer {

    /** The single delimiter character (ASCII code 44). */
    private static final char COMMA = ',';

    /**
     * Constructs a new CommaTokenizer.
     *
     * @param matchVersion
     *            Lucene version to match
     * @param in
     *            the input to split up into tokens
     */
    public CommaTokenizer(Version matchVersion, Reader in) {
        super(matchVersion, in);
    }

    /**
     * Constructs a new CommaTokenizer using a given {@link AttributeSource}.
     *
     * @param matchVersion
     *            Lucene version to match
     * @param source
     *            the attribute source to use for this tokenizer
     * @param in
     *            the input to split up into tokens
     */
    public CommaTokenizer(Version matchVersion, AttributeSource source,
            Reader in) {
        super(matchVersion, source, in);
    }

    /**
     * Constructs a new CommaTokenizer using a given
     * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
     *
     * @param matchVersion
     *            Lucene version to match
     * @param factory
     *            the attribute factory to use for this tokenizer
     * @param in
     *            the input to split up into tokens
     */
    public CommaTokenizer(Version matchVersion, AttributeFactory factory,
            Reader in) {
        super(matchVersion, factory, in);
    }

    /**
     * Constructs a new CommaTokenizer without an explicit match version.
     *
     * @param in
     *            the input to split up into tokens
     * @deprecated use {@link #CommaTokenizer(Version, Reader)} instead. This
     *             will be removed in Lucene 4.0.
     */
    @Deprecated
    public CommaTokenizer(Reader in) {
        super(in);
    }

    /**
     * Constructs a new CommaTokenizer using a given {@link AttributeSource},
     * without an explicit match version.
     *
     * @param source
     *            the attribute source to use for this tokenizer
     * @param in
     *            the input to split up into tokens
     * @deprecated use {@link #CommaTokenizer(Version, AttributeSource, Reader)}
     *             instead. This will be removed in Lucene 4.0.
     */
    @Deprecated
    public CommaTokenizer(AttributeSource source, Reader in) {
        super(source, in);
    }

    /**
     * Constructs a new CommaTokenizer using a given
     * {@link org.apache.lucene.util.AttributeSource.AttributeFactory},
     * without an explicit match version.
     *
     * @param factory
     *            the attribute factory to use for this tokenizer
     * @param in
     *            the input to split up into tokens
     * @deprecated use
     *             {@link #CommaTokenizer(Version, AttributeSource.AttributeFactory, Reader)}
     *             instead. This will be removed in Lucene 4.0.
     */
    @Deprecated
    public CommaTokenizer(AttributeFactory factory, Reader in) {
        super(factory, in);
    }

    /**
     * Decides whether a code point belongs to a token.
     *
     * @param c
     *            the code point to test
     * @return {@code true} for every code point except the comma
     */
    @Override
    protected boolean isTokenChar(int c) {
        return c != COMMA;
    }
}