fdhfgh 发表于 2016-12-15 08:49:21

solr中使用paoding

  solr版本3.4.0
  paoding版本: Revision 154。从 http://paoding.googlecode.com/svn/trunk/ 下载源代码, 进入paoding-analysis目录执行build.bat, 生成paoding-analysis.jar

package com.sh2600.test.paoding;
import java.io.Reader;
import java.util.Map;
import net.paoding.analysis.analyzer.PaodingTokenizer;
import net.paoding.analysis.analyzer.TokenCollector;
import net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector;
import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.PaodingMaker;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.BaseTokenizerFactory;
/**
 * Solr tokenizer factory that produces Paoding-based Chinese tokenizers.
 *
 * <p>The segmentation strategy is selected through the optional {@code mode}
 * attribute of the {@code <tokenizer>} element: {@code most-words} (also
 * selected by {@code default} or by omitting the attribute) or
 * {@code max-word-length}. Matching is case-insensitive.
 */
public class ChineseTokenizerFactory extends BaseTokenizerFactory {

    /** Mode that emits as many words as possible; this is the default mode. */
    public static final String MOST_WORDS_MODE = "most-words";

    /** Mode that segments by maximum word length. */
    public static final String MAX_WORD_LENGTH_MODE = "max-word-length";

    // Starts at the default so create() is usable even if init() was never called.
    private String mode = MOST_WORDS_MODE;

    /**
     * Selects the segmentation mode.
     *
     * @param mode {@code null}, {@code "default"} or {@code "most-words"} for
     *             the most-words mode, {@code "max-word-length"} for the
     *             max-word-length mode (case-insensitive)
     * @throws IllegalArgumentException if {@code mode} is any other value
     */
    public void setMode(String mode) {
        if (mode == null || MOST_WORDS_MODE.equalsIgnoreCase(mode)
                || "default".equalsIgnoreCase(mode)) {
            this.mode = MOST_WORDS_MODE;
        } else if (MAX_WORD_LENGTH_MODE.equalsIgnoreCase(mode)) {
            this.mode = MAX_WORD_LENGTH_MODE;
        } else {
            throw new IllegalArgumentException("不合法的分析器Mode参数设置:" + mode);
        }
    }

    /**
     * Initializes the factory from the attributes of the tokenizer element.
     *
     * @param args attribute map; the optional {@code mode} entry selects the
     *             segmentation mode
     * @throws IllegalArgumentException if the {@code mode} entry is not a
     *                                  recognized value
     */
    @Override
    public void init(Map args) {
        super.init(args);
        // The attribute is optional: a missing entry must fall through to
        // setMode(null), which selects the default, instead of throwing an NPE.
        Object configuredMode = args.get("mode");
        setMode(configuredMode == null ? null : configuredMode.toString());
    }

    /**
     * Creates a Paoding tokenizer over {@code input} using the configured mode.
     *
     * @param input the character stream to tokenize
     * @return a new {@link PaodingTokenizer} reading from {@code input}
     */
    public Tokenizer create(Reader input) {
        return new PaodingTokenizer(input, PaodingMaker.make(),
                createTokenCollector());
    }

    /** Builds the token collector that corresponds to the current mode. */
    private TokenCollector createTokenCollector() {
        if (MOST_WORDS_MODE.equals(mode)) {
            return new MostWordsTokenCollector();
        }
        if (MAX_WORD_LENGTH_MODE.equals(mode)) {
            return new MaxWordLengthTokenCollector();
        }
        // setMode only ever stores one of the two constants, so this is a
        // broken invariant rather than a user error.
        throw new IllegalStateException("unexpected tokenizer mode: " + mode);
    }
}

  以上代码打包为paoding4solr.jar, 和paoding-analysis.jar放到$solr.home/lib下
  复制词典(dic目录)到/opt/paoding/dic
  复制paoding-*.properties到应用服务器的classpath(例如tomcat/lib下; 放到$solr.home/lib下似乎无法被加载)
  修改paoding-dic-home.properties中的paoding.dic.home=/opt/paoding/dic
  修改$solr.home/conf/schema.xml

<!-- "text" field type: both the index-time and the query-time analyzer
     tokenize with the Paoding-based ChineseTokenizerFactory in place of the
     commented-out WhitespaceTokenizerFactory. -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<!--<tokenizer class="solr.WhitespaceTokenizerFactory"/>-->
<!-- mode="most-words" matches ChineseTokenizerFactory.MOST_WORDS_MODE. -->
<tokenizer class="com.sh2600.test.paoding.ChineseTokenizerFactory" mode="most-words"/>
<!-- NOTE(review): the dots below are an elision in the original post,
     presumably standing for the analyzer's remaining filters; confirm. -->
......
</analyzer> 
  <analyzer type="query"> 
<!--<tokenizer class="solr.WhitespaceTokenizerFactory"/>--> 
    <!-- Same tokenizer class and mode as the index analyzer above. -->
    <tokenizer class="com.sh2600.test.paoding.ChineseTokenizerFactory" mode="most-words"/>
    ......
</analyzer> 
</fieldType>
   重启solr
  执行数据导入,查询
  参考
  http://tech.ddvip.com/2009-09/1252589447132071.html
  http://www.iyunv.com/topic/364513
页: [1]
查看完整版本: solr中使用paoding