solr入门之多线程操作solr中索引字段的解决

kaiser_cn · 发表于 2017-12-19 09:53:56

import java.io.IOException;　　
import java.util.ArrayList;
　　
import java.util.Date;
　　
import java.util.HashMap;
　　
import java.util.List;
　　
import java.util.Map;
　　
import java.util.Map.Entry;
　　
import java.util.Set;
　　

　　
import javax.annotation.Resource;
　　
import javax.management.RuntimeErrorException;
　　

　　
import org.apache.solr.client.solrj.SolrClient;
　　
import org.apache.solr.client.solrj.SolrQuery;
　　
import org.apache.solr.client.solrj.SolrServerException;
　　
import org.apache.solr.client.solrj.response.QueryResponse;
　　
import org.slf4j.Logger;
　　
import org.slf4j.LoggerFactory;
　　
import org.springframework.beans.factory.annotation.Value;
　　
import org.springframework.stereotype.Component;
　　

　　
import cn.com.mx.gome.search.core.util.MD5Utils;
　　
import cn.com.mx.gome.search.core.util.Pinyin4jUtil;
　　
import cn.com.mx.gome.search.core.util.SetToStringArrUtils;
　　
import cn.com.mx.gome.search.quote.digger.solrbean.ProductBean;
　　
import cn.com.mx.gome.suggest.cache.WordCountCache;
　　
import cn.com.mx.gome.suggest.domian.BaseSuggestIndex;
　　
import cn.com.mx.gome.suggest.solr.SolrServiceWrapper;
　　

　　
/**
　　* 商品类搜索建议推荐词导入
　　* @author songqinghu
　　*
　　*/
　　
@Component("fullIndexItemSuggestProcess")

　　
public>　　

　　private Logger logger = LoggerFactory.getLogger(FullIndexItemSuggestProcess.class);
　　

　　//这里注入一个工厂来生产指定的数据源获取类来获取不同的数据源类--先放着
　　

　　//solr连接
　　@Resource
　　private SolrServiceWrapper solrServiceWrapperImpl;
　　

　　@Value("${SORL_PRODUCT_NAME}") //从 properties 文件里注入 solr连接的名称
　　private String solrProductName;
　　

　　@Value("${SORL_SUGGEST_NAME}")
　　private String solrSuggestName;
　　

　　private  SolrClient itemClient;
　　

　　private SolrClient suggestClient;
　　

　　@Override
　　public List<BaseSuggestIndex> getBeans(int skip, int limit) {//这里在传一个參数用来确定数据导入方式
　　Map<String, Integer> words ;
　　if(false){
　　//words = getDataBySQL(skip, limit); //从原始数据库中获取
　　}else {
　　words= getDataBySolr(skip, limit);//从solr中获取
　　}
　　

　　List<BaseSuggestIndex> assembleBeans = AssembleBeans(words);
　　

　　return assembleBeans;
　　}
　　/**
　　*
　　* @描写叙述：从solr索引库中获取数据
　　* @param skip 開始
　　* @param limit 步距
　　* @return
　　* @return List<Product>
　　* @exception
　　* @createTime：2016年3月25日
　　* @author: songqinghu
　　*/
　　private Map<String,Integer> getDataBySolr(int skip, int limit) {
　　//查询指定范围内的数据
　　SolrQuery query = new SolrQuery();
　　query.set("q", "*:*");
　　query.setStart(skip);
　　query.setRows(limit);
　　QueryResponse response;
　　Map<String,Integer> words = null;
　　try {
　　response = itemClient.query(query);
　　

　　logger.info("skip="+skip+",limit="+limit+",获取到索引数  : "+response.getResults().getNumFound());
　　

　　List<ProductBean> productBeans = response.getBeans(ProductBean.class);
　　//数据处理部分---我要将获取到的数据中的须要的三个字段的汉字然后进行记录出现次数和去反复
　　words = suggestCollect(productBeans);
　　}catch (SolrServerException | IOException  e) {
　　logger.error("FullIndexItemSuggestProcess  getDataBySolr :" + e);
　　}
　　return words;
　　}
　　//将获取到的数据中的须要的三个字段的汉字然后进行记录出现次数和去反复
　　private Map<String, Integer> suggestCollect(List<ProductBean> productBeans) {
　　

　　Map<String, Integer> words = new HashMap<String,Integer>();
　　

　　for (ProductBean productBean : productBeans) { //这里须要考虑词语切分问题---不太会处理
　　

　　String name = productBean.getName(); //商品名称
　　mapOperation(words, name);
　　List<String> cateNames = productBean.getCateName();//类目名称
　　for (String cateName : cateNames) {
　　mapOperation(words, cateName);
　　}
　　String spuBrand = productBean.getSpuBrand();//品牌名称
　　mapOperation(words, spuBrand);
　　

　　}
　　return words;
　　}
　　/**
　　* 对map中数据进行操作
　　*/
　　private void mapOperation(Map<String, Integer> words,String name){
　　if(words.containsKey(name)){ //存在就加一
　　Integer count = words.get(name);
　　words.put(name, count+1);
　　}else{//不存在就存入
　　words.put(name, 1);
　　}
　　}
　　

　　//对原始查询后的集合数据进行拆分
　　public List<BaseSuggestIndex> AssembleBeans(Map<String,Integer> words){
　　ArrayList<BaseSuggestIndex> baseSuggestIndexs = new ArrayList<BaseSuggestIndex>();
　　Set<Entry<String, Integer>> entrySet = words.entrySet();
　　for (Entry<String, Integer> entry : entrySet) {
　　BaseSuggestIndex assembleBean = AssembleBean(entry);
　　baseSuggestIndexs.add(assembleBean);
　　}
　　

　　return baseSuggestIndexs;
　　}
　　//组装单个 solr文档对象
　　public BaseSuggestIndex AssembleBean(Entry<String, Integer> entry){
　　

　　BaseSuggestIndex baseIndex = new BaseSuggestIndex();
　　

　　String word = entry.getKey();
　　

　　Set<String> shortpy = Pinyin4jUtil.converterToFirstSpellToSet(word);
　　

　　Set<String> allpy = Pinyin4jUtil.converterToSpellToSet(word);
　　

　　baseIndex.setWord(word);
　　

　　baseIndex.setShort_py(SetToStringArrUtils.convertToStringArr(shortpy));
　　

　　baseIndex.setAll_py(SetToStringArrUtils.convertToStringArr(allpy));
　　//这里须要设置一下使用md5加密算法来保证每一个字符串相应的id唯一  涉及到分类问题可能会出现反复加入分类

　　String>　　baseIndex.setSuggestId(id);
　　

　　baseIndex.setType("product");
　　

　　baseIndex.setCreateTime(new Date().getTime());
　　//这里须要特别注意了--涉及到多线程问题了 ---当查询已经存在的词语的时候查到次数加上当前的次数---
　　//存在问题--怎样处理线程顺序保证多个线程是有序的操作呢?---存入映射关系  结束后再次进行次数更新?
　　//这里必须要处理下和实际的数据差距太大了了!!!!!!!!!!!!!
　　logger.info("词名: "+word + " 本轮次数 :"+ entry.getValue());
　　baseIndex.setCount(getCount4Word(id, entry.getValue()));
　　

　　return baseIndex;
　　}
　　/**
　　*
　　* @描写叙述：获取该词语的数量加上眼下的词语数量---首次从索引库中获取  后缓存到本地中以后从本地中获取
　　* 这里还是不太准确  尽管保证了词语出现次数通过加锁可是线程提交时是无法进行控制的还是存在误差的

　　* @param>　　* @param count 本轮词语出现的次数
　　* @return
　　* @return Integer
　　* @exception
　　* @createTime：2016年3月25日
　　* @author: songqinghu
　　*/

　　private Integer getCount4Word(String>　　

　　return WordCountCache.putCount(id, count);
　　}
　　

　　

　　

　　/**
　　*
　　* @描写叙述：通过数据库来获取数据源
　　* @param skip
　　* @param limit
　　* @return
　　* @return List<Product>
　　* @exception
　　* @createTime：2016年3月25日
　　* @author: songqinghu
　　*/
　　private Map<String, Integer> getDataBySQL(int skip, int limit) {
　　

　　return null;
　　}
　　

　　

　　@Override
　　public long getMaxNum() {//这里也须要依据数据源来配置下
　　if(itemClient == null){
　　itemClient = solrServiceWrapperImpl.getCollection(solrProductName);
　　}
　　SolrQuery query = new SolrQuery();
　　query.set("q", "*:*");
　　long maxNum = 0;
　　try {
　　maxNum = itemClient.query(query).getResults().getNumFound();
　　} catch (SolrServerException | IOException  e) {
　　logger.error("FullIndexItemSuggestProcess  getMaxNum :" + e);
　　}
　　logger.info(new Date()+"  最大值: "+maxNum);
　　return maxNum;
　　}
　　

　　@Override
　　public boolean isEnd(int skip, int limit, long maxNum) {
　　

　　//開始坐标要是小于最大数量就继续---要不要事实更新呢?时时更新吧
　　

　　return skip < getMaxNum() ? true : false;
　　}
　　/**
　　* 获取词语相应的索引id
　　*/
　　@Override
　　public String getId(String word) {
　　if(word ==null){
　　throw new RuntimeException("id不能为空");
　　}
　　return "product_"+MD5Utils.MD5(word);
　　}
　　

　　
}
　　

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] solr入门之多线程操作solr中索引字段的解决

浏览过的版块

扫码加入运维网微信交流群