public class wordcount
{
public static class TokenizerMapper extends Mapper
{
private final static IntWritable one=new IntWritable(1);
private Text word=new Text();
//map
public void map(Object key,Text value,Context context) throws IOException,InterruptedException
{
//split the string to words
StringTokenizer itr=new StringTokenizer(value.toString());
while(itr.hasMoreTokens()){
word.set(itr.nextToken());//push the split's word to the word class
context.write(word,one);
}
}
}
public static class IntSumReduce extends Reducer {
private IntWritable result=new IntWritable();
//reduce
public void reduce(Text key,Iterable values,Context context) throws IOException,InterruptedException
{
int sum=0;
for(IntWritable val:values){
sum+=val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception{
// TODO Auto-generated method stub
Configuration conf=new Configuration();
//parameters
String[] otherArgs=new GenericOptionsParser(conf,args).getRemainingArgs();
if(otherArgs.length!=2){
System.err.println("Usage:wordcount");
System.exit(2);
}
Job job=new Job(conf,"word count");
job.setJarByClass(wordcount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReduce.class);
job.setReducerClass(IntSumReduce.class);
job.setOutputKeyClass(Text.class);
//set output value
job.setOutputValueClass(IntWritable.class);
//set the path of input file
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
//set the path of output file
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
//submit the task and wait until it's end
System.exit(job.waitForCompletion(true)?0:1);
}
}
2. 实现简单的 line indexer：将输入文件中的文本按单词切分，并为每个单词输出其所在行的偏移量以及该行的完整文本。
注意：该程序对中文文本以及含数字的文本处理效果不佳（按空白分词无法正确切分中文）。
View Code
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
// NOTE: the "Type mismatch in key from map" error occurs when Mapper/Reducer are declared
// with raw types (or wrong type parameters), so map()/reduce() do not actually override the
// framework methods — fix the generics, as done below.
public class lineIndexer {
    /**
     * Mapper: for every word on a line, emits (word, "offset:line") so the
     * reducer can build an index of where each word occurs.
     *
     * The type parameters are essential: with a raw {@code Mapper} supertype,
     * the {@code @Override} on map(LongWritable, Text, Context) does not match
     * Mapper.map(KEYIN, VALUEIN, Context) and the job fails with
     * "Type mismatch in key from map".
     */
    public static class LineIndexerMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused across map() calls to avoid per-record allocations.
        private final Text word = new Text();
        private final Text summary = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // NOTE(review): whitespace tokenization handles neither CJK text
            // nor digit-heavy tokens well, as the original author observed.
            StringTokenizer itr = new StringTokenizer(line.toLowerCase());
            // key is the byte offset of this line within the input split.
            summary.set(key.toString() + ":" + line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, summary);
            }
        }
    }

    /**
     * Reducer: concatenates all "offset:line" summaries for a word,
     * separated by '^'. Its output types equal its input types (Text, Text),
     * so it can also be used as a combiner.
     */
    public static class LineIndexerReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder str = new StringBuilder();
            boolean first = true;
            for (Text value : values) {
                if (!first) {
                    str.append('^');
                }
                first = false;
                str.append(value.toString());
            }
            context.write(key, new Text(str.toString()));
        }
    }

    /**
     * Configures and submits the line-indexer job.
     * Usage: LineIndexer &lt;input path&gt; &lt;output path&gt;
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Strip generic Hadoop options (-D, -files, ...) before reading our args.
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: LineIndexer <in> <out>");
            System.exit(2);
        }
        try {
            Job job = new Job(conf, "Line Indexer");
            job.setJarByClass(lineIndexer.class);
            job.setMapperClass(LineIndexerMapper.class);
            // Combining early shrinks the shuffle; safe because the reducer's
            // input and output types are identical.
            job.setCombinerClass(LineIndexerReducer.class);
            job.setReducerClass(LineIndexerReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (InterruptedException ie) {
            // Restore the interrupt flag instead of silently swallowing it.
            Thread.currentThread().interrupt();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
3.文档倒排索引,就是将输入文档中的单词统计出来,并输出每个单词出现的文档名以及偏移量,输出结果如下所示
bird doc3@0
blue doc2@9
fish doc2@9;doc2@0;doc1@9;doc1@0
one doc1@0;doc3@0
red doc3@0;doc2@0
two doc1@9