Hadoop 下Kmeans分析

iamstar · 发表于 2015-7-13 10:33:40

Hadoop下Kmeans的实现

package org.apache.mahout.clustering.syntheticcontrol.kmeans;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;

import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.conversion.InputDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.utils.clustering.ClusterDumper;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

import java.util.Map;

/**
 * Example driver that runs k-means over a directory of space-delimited numeric
 * data. The input is first converted to vectors, initial clusters are obtained
 * either from random seeds (when a cluster count is given) or from a canopy
 * pass (when T1/T2 thresholds are given), k-means is run, and the resulting
 * clusters are printed with {@link ClusterDumper}.
 */
public final class Job extends AbstractJob {

  private static final Logger log = LoggerFactory.getLogger(Job.class);

  /** Sub-directory of the output path that receives the converted input vectors. */
  private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";

  /** Vector implementation name handed to {@link InputDriver} for the conversion step. */
  private static final String VECTOR_CLASS_NAME = "org.apache.mahout.math.RandomAccessSparseVector";

  /** Sub-directory of the output path from which the cluster dumper reads the clustered points. */
  private static final String CLUSTERED_POINTS_DIR = "clusteredPoints";

  private Job() {
  }

  /**
   * Command-line entry point. With arguments, they are parsed by
   * {@link #run(String[])} through {@link ToolRunner}. Without arguments, the
   * job deletes the directory {@code output}, reads from {@code testdata} and
   * clusters with a Euclidean distance measure, k = 6, convergence delta 0.5
   * and at most 10 iterations.
   *
   * @param args command-line arguments; may be empty
   * @throws Exception if any stage of the job fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length > 0) {
      log.info("Running with only user-supplied arguments");
      ToolRunner.run(new Configuration(), new Job(), args);
    } else {
      log.info("Running with default arguments");

      Path output = new Path("output");
      Configuration conf = new Configuration();
      HadoopUtil.delete(conf, output);
      new Job().run(conf, new Path("testdata"), output,
          new EuclideanDistanceMeasure(), 6, 0.5, 10);
    }
  }

  /**
   * Parses the command line and dispatches to one of the {@code run}
   * overloads: the random-seed variant when the cluster-count option is
   * present, the canopy variant otherwise.
   *
   * @param args command-line arguments
   * @return 0 on success, -1 if the arguments could not be parsed or if
   *         neither a cluster count nor both canopy thresholds were supplied
   */
  @Override
  public int run(String[] args)
      throws IOException, ClassNotFoundException, InstantiationException,
             IllegalAccessException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.numClustersOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.convergenceOption().create());
    addOption(DefaultOptionCreator.maxIterationsOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    // Only the null-ness of the parse result matters here, so the value type is left open.
    Map<?, ?> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    // Validate the seeding parameters before anything destructive happens:
    // without this check a missing threshold surfaces as an exception from
    // Double.parseDouble after the output directory has already been removed.
    boolean useRandomSeeds = hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION);
    boolean haveCanopyThresholds = hasOption(DefaultOptionCreator.T1_OPTION)
        && hasOption(DefaultOptionCreator.T2_OPTION);
    if (!useRandomSeeds && !haveCanopyThresholds) {
      log.error("Missing clustering parameters: supply either '"
          + DefaultOptionCreator.NUM_CLUSTERS_OPTION + "' or both '"
          + DefaultOptionCreator.T1_OPTION + "' and '"
          + DefaultOptionCreator.T2_OPTION + "'");
      return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();

    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    if (measureClass == null) {
      measureClass = SquaredEuclideanDistanceMeasure.class.getName();
    }

    double convergenceDelta = Double.parseDouble(
        getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
    int maxIterations = Integer.parseInt(
        getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));

    // asSubclass rejects a class that is not a DistanceMeasure before it is instantiated.
    ClassLoader ccl = Thread.currentThread().getContextClassLoader();
    Class<? extends DistanceMeasure> measureType =
        ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class);
    DistanceMeasure measure = measureType.newInstance();

    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      HadoopUtil.delete(getConf(), output);
    }

    if (useRandomSeeds) {
      int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
      run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
    } else {
      double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
      double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
      run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
    }

    return 0;
  }

  /**
   * Runs k-means starting from {@code k} randomly chosen seed clusters.
   * The input is converted into {@code <output>/data}, seeds are generated by
   * {@link RandomSeedGenerator}, k-means is run by {@link KMeansDriver}, and
   * the final clusters are printed by {@link ClusterDumper}, which reads the
   * clustered points from {@code <output>/clusteredPoints}. This method does
   * not delete an existing output directory; callers do that beforehand.
   *
   * @param conf
   *          the Configuration to use
   * @param input
   *          the input directory path
   * @param output
   *          the output directory path
   * @param measure
   *          the DistanceMeasure to use
   * @param k
   *          the number of clusters in k-means
   * @param convergenceDelta
   *          the double convergence criteria for iterations
   * @param maxIterations
   *          the int maximum number of iterations
   */
  public void run(Configuration conf, Path input, Path output,
      DistanceMeasure measure, int k, double convergenceDelta,
      int maxIterations)
      throws IOException, InterruptedException, ClassNotFoundException {
    Path convertedInput = prepareInput(input, output);

    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    clusters = RandomSeedGenerator.buildRandom(conf, convertedInput, clusters, k, measure);

    log.info("Running KMeans");
    // NOTE(review): the trailing flags are presumably runClustering=true and
    // runSequential=false - confirm against the KMeansDriver in use.
    KMeansDriver.run(conf, convertedInput, clusters, output, measure,
        convergenceDelta, maxIterations, true, false);

    dumpClusters(conf, output, maxIterations);
  }

  /**
   * Runs k-means starting from clusters produced by a canopy pass with the
   * given T1/T2 thresholds. The input is converted into {@code <output>/data},
   * {@link CanopyDriver} is run over it, {@link KMeansDriver} then starts from
   * the initial-clusters directory under the output path, and the final
   * clusters are printed by {@link ClusterDumper}, which reads the clustered
   * points from {@code <output>/clusteredPoints}. This method does not delete
   * an existing output directory; callers do that beforehand.
   *
   * @param conf
   *          the Configuration to use
   * @param input
   *          the input directory path
   * @param output
   *          the output directory path
   * @param measure
   *          the DistanceMeasure to use
   * @param t1
   *          the canopy T1 threshold
   * @param t2
   *          the canopy T2 threshold
   * @param convergenceDelta
   *          the double convergence criteria for iterations
   * @param maxIterations
   *          the int maximum number of iterations
   */
  public void run(Configuration conf, Path input, Path output,
      DistanceMeasure measure, double t1, double t2, double convergenceDelta,
      int maxIterations)
      throws IOException, InterruptedException, ClassNotFoundException,
             InstantiationException, IllegalAccessException {
    Path convertedInput = prepareInput(input, output);

    log.info("Running Canopy to get initial clusters");
    // NOTE(review): the trailing flags are presumably runClustering=false and
    // runSequential=false - confirm against the CanopyDriver in use.
    CanopyDriver.run(conf, convertedInput, output, measure, t1, t2, false, false);

    log.info("Running KMeans");
    KMeansDriver.run(conf, convertedInput,
        new Path(output, Cluster.INITIAL_CLUSTERS_DIR), output, measure,
        convergenceDelta, maxIterations, true, false);

    dumpClusters(conf, output, maxIterations);
  }

  /**
   * Converts the raw input into vectors under the converted-input
   * sub-directory of {@code output} and returns that directory.
   */
  private static Path prepareInput(Path input, Path output)
      throws IOException, InterruptedException, ClassNotFoundException {
    Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, VECTOR_CLASS_NAME);
    return convertedInput;
  }

  /**
   * Prints the clusters of the last completed iteration together with the
   * clustered points found under {@code output}.
   */
  private static void dumpClusters(Configuration conf, Path output, int maxIterations)
      throws IOException, InterruptedException, ClassNotFoundException {
    ClusterDumper clusterDumper = new ClusterDumper(
        finalClusterPath(conf, output, maxIterations),
        new Path(output, CLUSTERED_POINTS_DIR));
    clusterDumper.printClusters(null);
  }

  /**
   * Returns the path to the final iteration's clusters: the existing
   * {@code clusters-N} directory under {@code output} with the largest
   * {@code N} not exceeding {@code maxIterations}.
   *
   * @throws IOException if the filesystem cannot be accessed or no such
   *         directory exists
   */
  private static Path finalClusterPath(Configuration conf, Path output,
      int maxIterations) throws IOException {
    // Resolve the filesystem from the path itself so that an output URI on a
    // non-default filesystem is probed where it actually lives.
    FileSystem fs = output.getFileSystem(conf);

    for (int i = maxIterations; i >= 0; i--) {
      Path clusters = new Path(output, "clusters-" + i);
      if (fs.exists(clusters)) {
        return clusters;
      }
    }

    throw new IOException("No clusters-N directory with N <= " + maxIterations
        + " found under " + output);
  }
}

账号		自动登录	找回密码
密码			立即注册

Centos6.5×64安装配置openmeetings3.0.3详

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

[经验分享] Hadoop 下Kmeans分析

浏览过的版块

扫码加入运维网微信交流群