bestjoe 发表于 2017-12-16 09:36:34

MongoDB MapReduce 多个条件分组(group by)

  from:https://my.oschina.net/chiyong/blog/289138
  MongoDB 没有传统数据库那样的 group by 函数,如果需要分组,就要使用 MapReduce。这种 MR 与 Hadoop 的 MR 类似。下面看看 MongoDB 的分组实现。
  现在有一张表,它的数据格式如下:
  {
  "_id" : ObjectId("53b224e0a1ae72328a57702c"),
  "title" : "SECJ0118E",
  "criteria" : "未找到对应的错误码",
  "actual" : "1",
  "effect" : "可能引起重大问题",
  "suggestion" : "请专家提供意见",
  "severity" : "Normal",
  "status" : "NotOK",
  "rtype" : "FormLoginExte",
  "comment" : " 0000001e FormLoginExte E SECJ0118E: Authentication error during authentication for user rpt",
  "category" : "logs",
  "time" : "0008-02-12 17:28:21"
  }
  {
  "_id" : ObjectId("53b224e0a1ae72328a577052"),
  "title" : "",
  "criteria" : "未找到对应的错误码",
  "actual" : "1",
  "effect" : "可能引起重大问题",
  "suggestion" : "请专家提供意见",
  "severity" : "Normal",
  "status" : "NotOK",
  "rtype" : "servlet",
  "comment" : " 0000000b servlet E com.ibm.ws.webcontainer.servlet.ServletWrapper init Uncaught.init.exception.thrown.by.servlet",
  "category" : "logs",
  "time" : "0008-02-12 19:04:41"
  }
  {
  "_id" : ObjectId("53b224e0a1ae72328a576fdc"),
  "title" : "系统资源设置",
  "criteria" : "unlimited",
  "actual" : "unlimited",
  "effect" : "如果对用户资源做了limits限制,有可能造成应用运行失败或系统性能下降。",
  "suggestion" : "建议修改/etc/security/limits,编辑root相关参数部分都为-1。",
  "severity" : "None",
  "status" : "OK",
  "rtype" : "系统参数设置检查",
  "comment" : "",
  "category" : "params"
  }
  1:单个条件分组
  (1) 现在我们根据title进行分组 并且统计每个组的数量
  // Group the documents of the "check_result" collection by title and count
  // how many documents fall into each group.
  db.runCommand({
    mapreduce: "check_result",
    // Emits one {count: 1} per document. The emit key is the grouping
    // condition; here it is the document's title.
    map: function Map() {
      emit(this.title, { count: 1 });
    },
    // Sums the partial counts collected for one key.
    // key:    the grouping key passed to emit.
    // values: array of {count: n} objects for that key.
    // Returns the same {count: n} shape that map emits.
    reduce: function Reduce(key, values) {
      // Declared with var so the accumulator is local, not an implicit global.
      var total = 0;
      // Index each element: the count lives on values[i], not on the array.
      for (var i = 0; i < values.length; i++) {
        total += values[i].count;
      }
      return { count: total };
    },
    // Passes the reduced value through unchanged.
    finalize: function Finalize(key, reduced) {
      return reduced;
    },
    out: { inline: 1 }
  });
  结果如下(结果中的_id键就是要分组的title 。value是分组后的值):
  { "_id" : "" , "value" : { "count" : 113.0}}
  { "_id" : "/tmp是否设置了t标志位" , "value" : { "count" : 21.0}}
  { "_id" : "ASYN0080W" , "value" : { "count" : 120.0}}
  { "_id" : "AppServer的JVM堆最大值" , "value" : { "count" : 6.0}}
  { "_id" : "AppServer的JVM堆最小值" , "value" : { "count" : 6.0}}
  { "_id" : "AppServer的JVM标准输出日志切换周期" , "value" : { "count" : 6.0}}
  { "_id" : "AppServer的JVM标准输出日志回滚类型" , "value" : { "count" : 6.0}}
  { "_id" : "AppServer的JVM标准错误日志切换周期" , "value" : { "count" : 6.0}}
  { "_id" : "AppServer的JVM标准错误日志回滚类型" , "value" : { "count" : 6.0}}
  { "_id" : "AppServer的WebContainer线程池最大值" , "value" : { "count" : 6.0}}
  { "_id" : "AppServer的WebContainer线程池最小值" , "value" : { "count" : 6.0}}
  { "_id" : "AppServer的通用JVM参数" , "value" : { "count" : 6.0}}
  { "_id" : "AppServer的通用JVM参数-SystemGC" , "value" : { "count" : 6.0}}
  { "_id" : "Audit是否打开" , "value" : { "count" : 21.0}}
  { "_id" : "CWPKI0041W" , "value" : { "count" : 65.0}}
  { "_id" : "CWPMC0017W" , "value" : { "count" : 7.0}}
  { "_id" : "CWSAA0037W" , "value" : { "count" : 13.0}}
  { "_id" : "Could not invoke an operation on object" , "value" : { "count" : 21.0}}
  { "_id" : "DCSV0000W" , "value" : { "count" : 4.0}}
  { "_id" : "DCSV1115W" , "value" : { "count" : 137.0}}
  2:多个条件分组
  (1) 现在我们根据title,status,severity进行分组 并且统计每个组的数量
  // Group the documents of the "check_result" collection by the combination
  // of title, status and severity, and count the documents in each group.
  db.runCommand({
    mapreduce: "check_result",
    // Emits one {count: 1} per document. The emit key is the grouping
    // condition; an object key groups by several fields at once.
    map: function Map() {
      emit(
        // Key name corrected from the misspelled "serverity".
        { title: this.title, status: this.status, severity: this.severity },
        { count: 1 }
      );
    },
    // Sums the partial counts collected for one key.
    // key:    the {title, status, severity} object passed to emit.
    // values: array of {count: n} objects for that key.
    // Returns the same {count: n} shape that map emits.
    reduce: function Reduce(key, values) {
      // Declared with var so the accumulator is local, not an implicit global.
      var total = 0;
      // Index each element: the count lives on values[i], not on the array.
      for (var i = 0; i < values.length; i++) {
        total += values[i].count;
      }
      return { count: total };
    },
    // Passes the reduced value through unchanged.
    finalize: function Finalize(key, reduced) {
      return reduced;
    },
    out: { inline: 1 }
  });
  格式化后的输出结果如下(示例中的 _id 只显示了 title 和 status 两个字段):
  { "_id" : { "title" : "" , "status" : "NotOK"} , "value" : { "count" : 113.0}}
  { "_id" : { "title" : "/tmp是否设置了t标志位" , "status" : "NotOK"} , "value" : { "count" : 21.0}}
  { "_id" : { "title" : "ASYN0080W" , "status" : "NotOK"} , "value" : { "count" : 120.0}}
  { "_id" : { "title" : "AppServer的JVM堆最大值" , "status" : "NotOK"} , "value" : { "count" : 6.0}}
  { "_id" : { "title" : "AppServer的JVM堆最小值" , "status" : "NotOK"} , "value" : { "count" : 6.0}}
  { "_id" : { "title" : "AppServer的JVM标准输出日志切换周期" , "status" : "NotOK"} , "value" : { "count" : 6.0}}
  { "_id" : { "title" : "AppServer的JVM标准输出日志回滚类型" , "status" : "NotOK"} , "value" : { "count" : 6.0}}
  { "_id" : { "title" : "AppServer的JVM标准错误日志切换周期" , "status" : "NotOK"} , "value" : { "count" : 6.0}}
  { "_id" : { "title" : "AppServer的JVM标准错误日志回滚类型" , "status" : "NotOK"} , "value" : { "count" : 6.0}}
  { "_id" : { "title" : "AppServer的WebContainer线程池最大值" , "status" : "NotOK"} , "value" : { "count" : 6.0}}
  { "_id" : { "title" : "AppServer的WebContainer线程池最小值" , "status" : "NotOK"} , "value" : { "count" : 6.0}}
  { "_id" : { "title" : "AppServer的通用JVM参数" , "status" : "NotOK"} , "value" : { "count" : 6.0}}
  { "_id" : { "title" : "AppServer的通用JVM参数-SystemGC" , "status" : "NotOK"} , "value" : { "count" : 6.0}}
  { "_id" : { "title" : "Audit是否打开" , "status" : "NotOK"} , "value" : { "count" : 21.0}}
  { "_id" : { "title" : "CWPKI0041W" , "status" : "NotOK"} , "value" : { "count" : 65.0}}
  { "_id" : { "title" : "CWPMC0017W" , "status" : "NotOK"} , "value" : { "count" : 7.0}}
  { "_id" : { "title" : "CWSAA0037W" , "status" : "NotOK"} , "value" : { "count" : 13.0}}
  { "_id" : { "title" : "Could not invoke an operation on object" , "status" : "NotOK"} , "value" : { "count" : 21.0}}
  { "_id" : { "title" : "DCSV0000W" , "status" : "NotOK"} , "value" : { "count" : 4.0}}
  { "_id" : { "title" : "DCSV1115W" , "status" : "NotOK"} , "value" : { "count" : 137.0}}
页: [1]
查看完整版本: MongoDB MapReduce 多个条件分组(group by)