Hadoop WordCount 小程序

mofdan · 发表于 2017-12-17 07:36:35

import java.io.File;　　

import java.io.IOException;　　

import java.net.URI;　　

import java.net.URISyntaxException;　　

　　
import org.apache.hadoop.conf.Configuration;
　　
import org.apache.hadoop.fs.FileSystem;
　　
import org.apache.hadoop.fs.Path;
　　
import org.apache.hadoop.io.LongWritable;
　　
import org.apache.hadoop.io.Text;
　　
import org.apache.hadoop.mapreduce.Job;
　　
import org.apache.hadoop.mapreduce.Mapper;
　　
import org.apache.hadoop.mapreduce.Reducer;
　　
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
　　
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
　　

　　
public>　　

　　
   static final String INPUT_PATH = "hdfs://masters:9000/user/hadoop/input";
　　
   static final String OUTPUT_PATH = "hdfs://masters:9000/user/hadoop/output";

　　
   public static void main(String[] args) throws IOException,>　　

　　
      //添加以下的代码，就可以联通，不知道咋回事
　　
      String path = new File(".").getCanonicalPath();
　　
      System.getProperties().put("hadoop.home.dir", path);
　　
      new File("./bin").mkdirs();
　　
      new File("./bin/winutils.exe").createNewFile();
　　

　　
      Configuration conf = new Configuration();
　　
      Path outpath = new Path(OUTPUT_PATH);
　　

　　
      Job job = new Job(conf, "WorldCount");
　　

　　
      FileInputFormat.setInputPaths(job, INPUT_PATH);
　　
      FileOutputFormat.setOutputPath(job, outpath);
　　

　　
      //检测输出路径是否存在，如果存在就删除，否则会报错
　　
      FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);
　　
      if(fileSystem.exists(outpath)){
　　
         fileSystem.delete(outpath, true);
　　
      }
　　

　　
      job.setMapperClass(MyMapper.class);
　　
      job.setReducerClass(MyReducer.class);
　　
      job.setOutputKeyClass(Text.class);
　　
      job.setOutputValueClass(LongWritable.class);
　　
      job.waitForCompletion(true);
　　
   }
　　

　　
   //输入，map，即拆分过程

　　
   static>　　

　　
      /*
　　
      * 输入为（key,value）输出为（value,count数量）
　　
      * 所以LongWritable, Text, Text, LongWritable分别代表 key(行号) value value count
　　
      * 其中LongWritable和Text是hadoop定义的类型，分别代表long和string两种类型
　　
      * */
　　
      protected void map(LongWritable k1, Text v1, Context context)throws IOException, InterruptedException{
　　
         String[] splits = v1.toString().split(" ");//按照空格拆分
　　
         for(String str: splits){
　　
               System.out.println("---" + str);
　　
               context.write(new Text(str), new LongWritable(1));//拆分出来的形式为（“单词”，出现次数（这里默认为1））
　　
         }
　　
      }
　　
   }
　　

　　
   //输出，reduce，汇总过程

　　
   static>　　
      protected void reduce(
　　
               Text k2, //输出的内容，即value
　　
               Iterable<LongWritable> v2s, //是一个longwritable类型的数组，所以用了Iterable这个迭代器，且元素为v2s
　　
               org.apache.hadoop.mapreduce.Reducer<Text, LongWritable, Text, LongWritable>.Context context)
　　
               //这里一定设置好，不然输出会变成单个单词，从而没有统计数量
　　
               throws IOException, InterruptedException {
　　
         //列表求和初始为0
　　
         long times = 0L;
　　
         for(LongWritable count:v2s){
　　
               times += count.get();
　　
         }
　　
         context.write(k2, new LongWritable(times));
　　
      }
　　
   }
　　
}

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] Hadoop WordCount 小程序

浏览过的版块

扫码加入运维网微信交流群