#apt-get install mysql-server
2. Add the handler
Edit the solrconfig.xml file under /opt/solr-tomcat/solr/conf and add the following inside the <config> element:
<requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler">
  <lst name="defaults">
    <str name="config">data-config.xml</str>
  </lst>
</requestHandler>
3. Create a new data-config.xml file in the same folder, with the following content:
<dataConfig>
  <dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://127.0.0.1:3306/db_expert" user="root" password="root"/>
  <document>
    <entity name="expert"
            query="select id,experName,researchfield,title,workunit,postaladdress,officephone,email,biography from expertinfo"/>
  </document>
</dataConfig>
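Before wiring this into Solr, it can help to confirm that the JDBC URL, credentials, and SQL actually work on their own. The following is a minimal sanity-check sketch of my own (not part of the original setup); it assumes the db_expert database from above and the MySQL connector jar on the classpath:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

// Hypothetical sanity check: run (a cut-down version of) the query from
// data-config.xml and print a few rows, to verify connectivity and column names.
public class DataSourceCheck {
    public static void main(String[] args) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");  // same driver as in data-config.xml
        Connection conn = DriverManager.getConnection(
                "jdbc:mysql://127.0.0.1:3306/db_expert", "root", "root");
        try {
            Statement stmt = conn.createStatement();
            ResultSet rs = stmt.executeQuery(
                    "select id, experName, researchfield from expertinfo limit 5");
            while (rs.next()) {
                System.out.println(rs.getInt("id") + "\t" + rs.getString("experName"));
            }
        } finally {
            conn.close();
        }
    }
}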
4. Modify schema.xml: find <fieldType name="text" and change the tokenizer to a Chinese one. A wrapped Paoding tokenizer is used here; that project no longer seems to be maintained, so IKAnalyzer may be worth a look later.
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="net.paoding.analysis.analyzer.ChineseTokenizerFactory" mode="most-words"/>
    <!-- in this example, we will only use synonyms at query time
    <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
    -->
    <!-- Case insensitive stop word removal. Add enablePositionIncrements=true in both the index
         and query analyzers to leave a 'gap' for more accurate phrase queries. -->
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="net.paoding.analysis.analyzer.ChineseTokenizerFactory" mode="most-words"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
  </analyzer>
</fieldType>
Add any fields missing from the stock schema.xml yourself. schema.xml is UTF-8 encoded by default. First, add the fields to be indexed inside <fields></fields>.
Note: it is best to name the database table's primary key id; it makes things easier later on.
<field name="id" type="int" indexed="true" stored="true" required="true" /> <field name="experName" type="text" indexed="true" stored="true"/><field name="researchfield" type="text" indexed="true" stored="true"/><field name="title" type="text" indexed="true" stored="true"/><field name="workunit" type="text" indexed="true" stored="true"/><field name="postaladdress" type="text" indexed="true" stored="true"/><field name="officephone" type="text" indexed="true" stored="true"/><field name="email" type="text" indexed="true" stored="true"/><field name="biography" type="text" indexed="true" stored="true"/>
Then add the following lines below <solrQueryParser defaultOperator="OR"/>:
<copyField source="experName" dest="text"/>
<copyField source="researchfield" dest="text"/>
<copyField source="title" dest="text"/>
<copyField source="workunit" dest="text"/>
<copyField source="postaladdress" dest="text"/>
<copyField source="officephone" dest="text"/>
<copyField source="email" dest="text"/>
<copyField source="biography" dest="text"/>
5. Wrap the Paoding tokenizer
package net.paoding.analysis.analyzer;

import java.io.Reader;
import java.util.Map;

import net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector;
import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.PaodingMaker;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.BaseTokenizerFactory;

/**
 * Created by IntelliJ IDEA.
 * User: ronghao
 * Date: 2007-11-3
 * Time: 14:40:59
 * Chinese word segmentation: a wrapper around the Paoding tokenizer.
 */
public class ChineseTokenizerFactory extends BaseTokenizerFactory {
    /**
     * Most-words segmentation, the default mode.
     */
    public static final String MOST_WORDS_MODE = "most-words";
    /**
     * Max-word-length segmentation.
     */
    public static final String MAX_WORD_LENGTH_MODE = "max-word-length";

    private String mode = null;

    public void setMode(String mode) {
        if (mode == null || MOST_WORDS_MODE.equalsIgnoreCase(mode) || "default".equalsIgnoreCase(mode)) {
            this.mode = MOST_WORDS_MODE;
        } else if (MAX_WORD_LENGTH_MODE.equalsIgnoreCase(mode)) {
            this.mode = MAX_WORD_LENGTH_MODE;
        } else {
            throw new IllegalArgumentException("Illegal analyzer mode parameter: " + mode);
        }
    }

    @Override
    public void init(Map<String, String> args) {
        super.init(args);
        setMode(args.get("mode"));
    }

    public Tokenizer create(Reader input) {
        return new PaodingTokenizer2(input, PaodingMaker.make(), createTokenCollector());
    }

    private TokenCollector createTokenCollector() {
        if (MOST_WORDS_MODE.equals(mode))
            return new MostWordsTokenCollector();
        if (MAX_WORD_LENGTH_MODE.equals(mode))
            return new MaxWordLengthTokenCollector();
        throw new Error("never happened");
    }
}
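To check the wrapper outside of Solr, a small smoke test like the following can be used. This is my own sketch, not part of the original article; it assumes both wrapper classes compile, the Paoding jars are on the classpath, PAODING_DIC_HOME is set, and the Lucene 2.x token API is in use:

package net.paoding.analysis.analyzer;

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

// Hypothetical smoke test for the factory above; prints the tokens Paoding
// produces for a sample Chinese string.
public class ChineseTokenizerFactorySmokeTest {
    public static void main(String[] args) throws Exception {
        ChineseTokenizerFactory factory = new ChineseTokenizerFactory();
        Map<String, String> params = new HashMap<String, String>();
        params.put("mode", "most-words");   // same mode as in schema.xml
        factory.init(params);

        Tokenizer tokenizer = factory.create(new StringReader("中文分词测试"));
        Token token;
        // Lucene 2.x-style iteration; later Lucene versions use incrementToken() instead.
        while ((token = tokenizer.next()) != null) {
            System.out.println(token.term());
        }
    }
}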
Both modified classes (ChineseTokenizerFactory and the PaodingTokenizer2 it references) go under /opt/tomcat/webapps/solr: create a classes folder inside the webapp's WEB-INF directory and copy the two classes in, preserving their package hierarchy.
6. Set the PAODING_DIC_HOME environment variable to point to the location of the Paoding dictionary files (e.g. /opt/dic).
#vim /etc/profile
Add the following environment variables:
JAVA_HOME=/usr/lib/jvm/java-6-openjdk
CLASSPATH=.:/usr/lib/jvm/java-6-openjdk/lib
ANT_HOME=/usr/share/ant
PATH="$JAVA_HOME/bin:$ANT_HOME/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games"
PAODING_DIC_HOME=/opt/dic
export JAVA_HOME
export CLASSPATH
export ANT_HOME
export PATH
export PAODING_DIC_HOME
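Note that Paoding reads PAODING_DIC_HOME from the JVM's environment, so Tomcat must be started from a shell that has it (or it must be exported in Tomcat's startup script). A quick check of my own (hypothetical, not from the original article):

// Hypothetical check: run this in the same environment that starts Tomcat
// to confirm the dictionary path is visible to the JVM.
public class CheckDicHome {
    public static void main(String[] args) {
        String dicHome = System.getenv("PAODING_DIC_HOME");
        System.out.println(dicHome != null
                ? "PAODING_DIC_HOME=" + dicHome
                : "PAODING_DIC_HOME is not set");
    }
}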
7. Copy the Paoding jars and the MySQL JDBC driver into /opt/tomcat/webapps/solr/WEB-INF/lib.
Then open a browser and go to
http://localhost:8983/solr/dataimport?command=full-import
Once the import succeeds, open
http://localhost:8983/solr/admin/
and enter a search term in the search box to run a query.
The database indexing is now complete.
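Searches can also be run programmatically. Below is a minimal sketch of my own using the SolrJ client of that era (an assumption, not part of the original article; it presumes the SolrJ 1.x jars and their dependencies are on the classpath):

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;

// Hypothetical query example against the index built above (SolrJ 1.x API).
public class SearchExample {
    public static void main(String[] args) throws Exception {
        CommonsHttpSolrServer server = new CommonsHttpSolrServer("http://localhost:8983/solr");

        // The copyField rules route every indexed field into the default "text"
        // field, so a bare keyword query searches all fields at once.
        SolrQuery query = new SolrQuery("数据挖掘");
        query.setRows(10);

        QueryResponse rsp = server.query(query);
        for (SolrDocument doc : rsp.getResults()) {
            System.out.println(doc.getFieldValue("id") + "\t" + doc.getFieldValue("experName"));
        }
    }
}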