Hadoop例子中WordCount参数分析

q36988 · 发表于 2018-10-30 08:59:05

　　package org.apache.hadoop.examples;
　　import java.io.IOException;
　　import java.util.StringTokenizer;
　　import org.apache.hadoop.conf.Configuration;
　　import org.apache.hadoop.fs.Path;
　　import org.apache.hadoop.io.IntWritable;
　　import org.apache.hadoop.io.Text;
　　import org.apache.hadoop.mapreduce.Job;
　　import org.apache.hadoop.mapreduce.Mapper;
　　import org.apache.hadoop.mapreduce.Reducer;
　　import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
　　import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
　　import org.apache.hadoop.util.GenericOptionsParser;

　　public>　　public static>TokenizerMapper
　　extends Mapper{
　　/*
IntWritable是 Hadoop 中实现的用于封装 Java 数据类型的类,它的原型是public IntWritable(int value)和public IntWritable()两种。所以new IntWritable(1)是新建了这个类的一个对象，而数值1这是参数。在Hadoop中它相当于java中Integer整型变量，为这个变量赋值为1.类似于java中Integer i = new Integer(1)。　　
IntWritable实现了Writable的接口。Writable是Hadoop的序列号格式。当要在进程间传递对象或者持久化对象的时候，就需要对象序列号成字节流，，反之当要接受或者从磁盘读取字节流到对象的时候就要进行反序列化。
　　*/
　　private final static IntWritable one = new IntWritable(1);
　　private Text word = new Text();
　　public void map(Object key, Text value, Context context)
　　throws IOException, InterruptedException {
　　StringTokenizer itr = new StringTokenizer(value.toString());
　　while (itr.hasMoreTokens()) {
　　word.set(itr.nextToken());
　　context.write(word, one);
　　}
　　}
　　}
　　public static>IntSumReducer
　　extends Reducer {
　　private IntWritable result = new IntWritable();
　　public void reduce(Text key, Iterable values,Context context)
　　throws IOException, InterruptedException {
　　int sum = 0;
　　for (IntWritable val : values) {
　　sum += val.get();
　　}
　　result.set(sum);
　　context.write(key, result);
　　}
　　}
　　public static void main(String[] args) throws Exception {
　　Configuration conf = new Configuration();
　　String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
　　if (otherArgs.length != 2) {
　　System.err.println("Usage: wordcount ");
　　System.exit(2);
　　}
　　Job job = new Job(conf, "word count");
　　job.setJarByClass(WordCount.class);
　　job.setMapperClass(TokenizerMapper.class);
　　job.setCombinerClass(IntSumReducer.class);
　　job.setReducerClass(IntSumReducer.class);
　　job.setOutputKeyClass(Text.class);
　　job.setOutputValueClass(IntWritable.class);
　　FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
　　FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
　　System.exit(job.waitForCompletion(true) ? 0 : 1);
　　}
　　}

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

Red Hat RHCE 8 (EX294) Cert Guide

c++ size_t 和 int 的区别

HERE 使用 AWS EF 和 JFrog Artifactory 打

C++ 指针大全：从基础到进阶，一篇快速上手

wirelessnetview好用的无线分析工具

亿图图示专家(EDraw Max) V7.9 中文破解版

[经验分享] Hadoop例子中WordCount参数分析

浏览过的版块

扫码加入运维网微信交流群