public class wordcount
{
public static class TokenizerMapper extends Mapper
{
private final static IntWritable one=new IntWritable(1);
private Text word=new Text();
//map
public void map(Object key,Text value,Context context) throws IOException,InterruptedException
{
//split the string to words
StringTokenizer itr=new StringTokenizer(value.toString());
while(itr.hasMoreTokens()){
word.set(itr.nextToken());//push the split's word to the word class
context.write(word,one);
}
}
}
public static class IntSumReduce extends Reducer {
private IntWritable result=new IntWritable();
//reduce
public void reduce(Text key,Iterable values,Context context) throws IOException,InterruptedException
{
int sum=0;
for(IntWritable val:values){
sum+=val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception{
// TODO Auto-generated method stub
Configuration conf=new Configuration();
//parameters
String[] otherArgs=new GenericOptionsParser(conf,args).getRemainingArgs();
if(otherArgs.length!=2){
System.err.println("Usage:wordcount");
System.exit(2);
}
Job job=new Job(conf,"word count");
job.setJarByClass(wordcount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReduce.class);
job.setReducerClass(IntSumReduce.class);
job.setOutputKeyClass(Text.class);
//set output value
job.setOutputValueClass(IntWritable.class);
//set the path of input file
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
//set the path of output file
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
//submit the task and wait until it's end
System.exit(job.waitForCompletion(true)?0:1);
}
}
2. 实现简单的 line indexer：将输入文件中的文本按单词切分，并为每个单词输出其所在行的偏移量以及该行的完整文本。
注意：该程序对中文文本以及含数字的文本处理效果不佳（按空白分词无法正确切分中文）。
View Code
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
// NOTE: the "Type mismatch in key from map" error occurs when Mapper/Reducer are declared
// with raw types (or wrong type parameters), so map()/reduce() do not actually override the
// framework methods — fix the generics, as done below.
public class lineIndexer {
    /**
     * Mapper: for every word on a line, emits (word, "offset:line") so the
     * reducer can build an index of where each word occurs.
     *
     * The type parameters are essential: with a raw {@code Mapper} supertype,
     * the {@code @Override} on map(LongWritable, Text, Context) does not match
     * Mapper.map(KEYIN, VALUEIN, Context) and the job fails with
     * "Type mismatch in key from map".
     */
    public static class LineIndexerMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused across map() calls to avoid per-record allocations.
        private final Text word = new Text();
        private final Text summary = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // NOTE(review): whitespace tokenization handles neither CJK text
            // nor digit-heavy tokens well, as the original author observed.
            StringTokenizer itr = new StringTokenizer(line.toLowerCase());
            // key is the byte offset of this line within the input split.
            summary.set(key.toString() + ":" + line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, summary);
            }
        }
    }

    /**
     * Reducer: concatenates all "offset:line" summaries for a word,
     * separated by '^'. Its output types equal its input types (Text, Text),
     * so it can also be used as a combiner.
     */
    public static class LineIndexerReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder str = new StringBuilder();
            boolean first = true;
            for (Text value : values) {
                if (!first) {
                    str.append('^');
                }
                first = false;
                str.append(value.toString());
            }
            context.write(key, new Text(str.toString()));
        }
    }

    /**
     * Configures and submits the line-indexer job.
     * Usage: LineIndexer &lt;input path&gt; &lt;output path&gt;
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Strip generic Hadoop options (-D, -files, ...) before reading our args.
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: LineIndexer <in> <out>");
            System.exit(2);
        }
        try {
            Job job = new Job(conf, "Line Indexer");
            job.setJarByClass(lineIndexer.class);
            job.setMapperClass(LineIndexerMapper.class);
            // Combining early shrinks the shuffle; safe because the reducer's
            // input and output types are identical.
            job.setCombinerClass(LineIndexerReducer.class);
            job.setReducerClass(LineIndexerReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (InterruptedException ie) {
            // Restore the interrupt flag instead of silently swallowing it.
            Thread.currentThread().interrupt();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
3.文档倒排索引,就是将输入文档中的单词统计出来,并输出每个单词出现的文档名以及偏移量,输出结果如下所示
bird doc3@0
blue doc2@9
fish doc2@9;doc2@0;doc1@9;doc1@0
one doc1@0;doc3@0
red doc3@0;doc2@0
two doc1@9