package cn.edu.ytu.botao.wordcount;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Word count example
 * @author botao
 */
public class WordCount {
/**
 * Map phase.
 * Extends the Mapper class with input types <Object, Text>
 * and output types <Text, IntWritable>.
 */
public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
// one represents a single occurrence of a word
private final static IntWritable one = new IntWritable(1);
// word stores the token (word) cut out of the input line
private Text word = new Text();
/**
 * Override the map() method.
 */
@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
// Split the value (one line of the input file to be counted) into words
StringTokenizer tokenizer = new StringTokenizer(value.toString());
// Take each token and emit it
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
// map output: <word, 1>
context.write(word, one);
}
}
}
/**
 * Reduce phase.
 * Extends the Reducer class with input types <Text, IntWritable>
 * (which match the mapper's output types) and output types <Text, IntWritable>.
 */
public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
// numResult holds the total frequency of a word
private IntWritable numResult = new IntWritable();
/**
 * Override the reduce() method.
 */
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
int sum = 0;
// Sum all the counts received for this key
for (IntWritable val : values) {
sum+=val.get();
}
// Store the total in numResult
numResult.set(sum);
// Emit the result: <word, total count>
context.write(key, numResult);
}
}
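/**
 * Driver for the job. This main() method is not in the original listing; it is
 * a minimal sketch of a typical driver for the org.apache.hadoop.mapreduce API,
 * assuming Hadoop 2.x and that the input and output paths are passed as the two
 * command-line arguments. Fully qualified names are used so no extra imports
 * are needed above.
 */
public static void main(String[] args) throws Exception {
    org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
    org.apache.hadoop.mapreduce.Job job = org.apache.hadoop.mapreduce.Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    // Wire up the mapper and reducer defined above
    job.setMapperClass(TokenizerMapper.class);
    // Using the reducer as a combiner is safe here because summation is associative
    job.setCombinerClass(SumReducer.class);
    job.setReducerClass(SumReducer.class);
    // Types of the final output key/value pairs
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // args[0] = input path, args[1] = output path (assumed argument order)
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(
            job, new org.apache.hadoop.fs.Path(args[0]));
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(
            job, new org.apache.hadoop.fs.Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}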
}
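To run the job, package the classes into a jar and submit it with the hadoop command, for example (hypothetical jar name and HDFS paths):

hadoop jar wordcount.jar cn.edu.ytu.botao.wordcount.WordCount /user/botao/input /user/botao/output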