Hadoop2源码分析－MapReduce篇

xinghe0 · 发表于 2018-10-30 11:02:03

package cn.hdfs.mapreduce.example;　　
import java.io.BufferedReader;
　　
import java.io.FileReader;
　　
import java.io.IOException;
　　
import java.net.URI;
　　
import java.util.ArrayList;
　　
import java.util.HashSet;
　　
import java.util.List;
　　
import java.util.Set;
　　
import java.util.StringTokenizer;
　　
import org.apache.hadoop.conf.Configuration;
　　
import org.apache.hadoop.fs.Path;
　　
import org.apache.hadoop.io.IntWritable;
　　
import org.apache.hadoop.io.Text;
　　
import org.apache.hadoop.mapreduce.Job;
　　
import org.apache.hadoop.mapreduce.Mapper;
　　
import org.apache.hadoop.mapreduce.Reducer;
　　
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
　　
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
　　
import org.apache.hadoop.mapreduce.Counter;
　　
import org.apache.hadoop.util.GenericOptionsParser;
　　
import org.apache.hadoop.util.StringUtils;
　　
/*** @date Apr 17, 2015
　　
*
　　
* @author dengjie*/
　　
public class WordCount2 {
　　
public static class TokenizerMapper extends Mapper{
　　
      static enum CountersEnum { INPUT_WORDS }
　　
      private final static IntWritable one = new IntWritable(1);
　　
      private Text word = new Text();
　　
      private boolean caseSensitive;
　　
      private Set patternsToSkip = new HashSet();
　　
      private Configuration conf;
　　
      private BufferedReader fis;
　　

　　
@Override public void setup(Context context) throws IOException,
　　
      InterruptedException {
　　
   conf = context.getConfiguration();
　　
   caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);    if (conf.getBoolean("wordcount.skip.patterns", true)) {
　　
      URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();       for (URI patternsURI : patternsURIs) {
　　
      Path patternsPath = new Path(patternsURI.getPath());
　　
      String patternsFileName = patternsPath.getName().toString();
　　
      parseSkipFile(patternsFileName);
　　
      }
　　
   }
　　
} private void parseSkipFile(String fileName) {    try {
　　
      fis = new BufferedReader(new FileReader(fileName));
　　
      String pattern = null;       while ((pattern = fis.readLine()) != null) {
　　
      patternsToSkip.add(pattern);
　　
      }
　　
   } catch (IOException ioe) {
　　
      System.err.println("Caught exception while parsing the cached file '"
　　
         + StringUtils.stringifyException(ioe));
　　
   }
　　
}
　　

　　
@Override public void map(Object key, Text value, Context context
　　
                  ) throws IOException, InterruptedException {
　　
   String line = (caseSensitive) ?
　　
      value.toString() : value.toString().toLowerCase();    for (String pattern : patternsToSkip) {
　　
      line = line.replaceAll(pattern, "");
　　
   }
　　
   StringTokenizer itr = new StringTokenizer(line);    while (itr.hasMoreTokens()) {
　　
      word.set(itr.nextToken());
　　
      context.write(word, one);
　　
      Counter counter = context.getCounter(CountersEnum.class.getName(),
　　
         CountersEnum.INPUT_WORDS.toString());
　　
      counter.increment(1);
　　
   }
　　
}
　　
  }  public static class IntSumReducer    extends Reducer { private IntWritable result = new IntWritable(); public void reduce(Text key, Iterable values,
　　
                     Context context
　　
                     ) throws IOException, InterruptedException {    int sum = 0;    for (IntWritable val : values) {
　　
      sum += val.get();
　　
   }
　　
   result.set(sum);
　　
   context.write(key, result);
　　
}
　　
  }  public static void main(String[] args) throws Exception {
　　
Configuration conf = new Configuration();
　　
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
　　
String[] remainingArgs = optionParser.getRemainingArgs(); if (!(remainingArgs.length != 2 || remainingArgs.length != 4)) {
　　
   System.err.println("Usage: wordcount [-skip skipPatternFile]");
　　
   System.exit(2);
　　
}
　　
Job job = Job.getInstance(conf, "word count");
　　
job.setJarByClass(WordCount2.class);
　　
job.setMapperClass(TokenizerMapper.class);
　　
job.setCombinerClass(IntSumReducer.class);
　　
job.setReducerClass(IntSumReducer.class);
　　
job.setOutputKeyClass(Text.class);
　　
job.setOutputValueClass(IntWritable.class);
　　

　　
List otherArgs = new ArrayList(); for (int i=0; i < remainingArgs.length; ++i) {    if ("-skip".equals(remainingArgs)) {
　　
      job.addCacheFile(new Path(remainingArgs[++i]).toUri());
　　
      job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
　　
   } else {
　　
      otherArgs.add(remainingArgs);
　　
   }
　　
}
　　
FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
　　
FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));
　　

　　
System.exit(job.waitForCompletion(true) ? 0 : 1);
　　
  }
　　
}

账号		自动登录	找回密码
密码			立即注册

Centos6.5×64安装配置openmeetings3.0.3详

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

[经验分享] Hadoop2源码分析－MapReduce篇

扫码加入运维网微信交流群