shaoqin 发表于 2016-12-14 11:15:12

solr使用中文,庖丁分词

    1、分析器改造 net.paoding.analysis.analyzer.PaodingTokenizer.java：
  将类声明由原来的 extends TokenStream 改为 extends Tokenizer
  2、net.paoding.analysis.analyzer.solr.ChineseTokenizerFactory.java
  package net.paoding.analysis.analyzer.solr;
  import java.io.Reader;
  import java.util.Map;
  import net.paoding.analysis.analyzer.PaodingTokenizer;
  import net.paoding.analysis.analyzer.TokenCollector;
  import net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector;
  import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
  import net.paoding.analysis.knife.PaodingMaker;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.solr.analysis.BaseTokenizerFactory;
  /**
  * Created by IntelliJ IDEA. User: ronghao Date: 2007-11-3 Time: 14:40:59 中文切词
  * 对庖丁切词的封装
  */
  /**
   * Solr {@code TokenizerFactory} that wraps the Paoding Chinese word segmenter.
   *
   * <p>Configured in schema.xml via the {@code mode} attribute; supports
   * "most-words" (default) and "max-word-length" segmentation modes.
   *
   * <p>Created by IntelliJ IDEA. User: ronghao Date: 2007-11-3 Time: 14:40:59
   */
  public class ChineseTokenizerFactory extends BaseTokenizerFactory {

      /** Segment into as many words as possible (the default mode). */
      public static final String MOST_WORDS_MODE = "most-words";

      /** Segment keeping only the longest matching words. */
      public static final String MAX_WORD_LENGTH_MODE = "max-word-length";

      private String mode = null;

      /**
       * Sets the segmentation mode.
       *
       * @param mode {@code "most-words"}, {@code "max-word-length"},
       *             {@code "default"}, or {@code null} (the latter two map to
       *             {@link #MOST_WORDS_MODE}); comparison is case-insensitive
       * @throws IllegalArgumentException if {@code mode} is none of the above
       */
      public void setMode(String mode) {
          if (mode == null || MOST_WORDS_MODE.equalsIgnoreCase(mode)
                  || "default".equalsIgnoreCase(mode)) {
              this.mode = MOST_WORDS_MODE;
          } else if (MAX_WORD_LENGTH_MODE.equalsIgnoreCase(mode)) {
              this.mode = MAX_WORD_LENGTH_MODE;
          } else {
              // Fixed: the original message contained a long run of stray
              // spaces in the middle of the text (paste artifact).
              throw new IllegalArgumentException("不合法的分析器Mode参数设置:" + mode);
          }
      }

      /**
       * Reads the {@code mode} attribute supplied by schema.xml.
       *
       * @param args factory arguments from the Solr schema
       */
      @Override
      public void init(Map<String, String> args) {
          super.init(args);
          setMode(args.get("mode"));
      }

      /**
       * Creates a {@link PaodingTokenizer} over the given character stream,
       * using the collector matching the configured mode.
       *
       * @param input the text to tokenize
       * @return a new Paoding-backed tokenizer
       */
      public Tokenizer create(Reader input) {
          return new PaodingTokenizer(input, PaodingMaker.make(),
                  createTokenCollector());
      }

      /** Returns the token collector corresponding to the configured mode. */
      private TokenCollector createTokenCollector() {
          if (MOST_WORDS_MODE.equals(mode)) {
              return new MostWordsTokenCollector();
          }
          if (MAX_WORD_LENGTH_MODE.equals(mode)) {
              return new MaxWordLengthTokenCollector();
          }
          // setMode guarantees mode is one of the two constants; a raw Error
          // ("never happened") was replaced with the conventional type for a
          // violated internal invariant.
          throw new IllegalStateException("Unexpected mode: " + mode);
      }
  }
  3、下载庖丁 将paoding-analysis.jar改造成paoding-analysis-solr.jar(将1,2覆盖进去这个包)
  4、定义庖丁词典 paoding-dic-home.properties
  paoding.dic.home=D:/WORK/DOCSET/solr/paoding-analysis-2.0.4-beta/dic/
  5、替换原有的分词类 D:\WORK\DOCSET\solr\j-solr1\solr\conf\schema.xml
  <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">        
  <analyzer type="index">          
  <tokenizer class="net.paoding.analysis.analyzer.solr.ChineseTokenizerFactory" mode="most-words"/>          
  <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>       
  <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>          
  <filter class="solr.LowerCaseFilterFactory"/>           
  <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>         
  </analyzer>    
  <analyzer type="query">         
  <tokenizer class="net.paoding.analysis.analyzer.solr.ChineseTokenizerFactory" mode="most-words"/>            
  <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>         
  <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>      
  <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>         
  <filter class="solr.LowerCaseFilterFactory"/>          
  <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>     
  </analyzer> 
  </fieldtype>
页: [1]
查看完整版本: solr使用中文,庖丁分词