Hadoop：第二个程序操作HDFS

manf · 发表于 2015-7-13 08:04:33

本代码包含功能：先将本地文件/test.c复制到HDFS中的hdfs:///copyOftest.c；再获取各DataNode的主机名，并写入到HDFS文件系统中的文件hdfs:///output/listOfDatanode中。
最后对文件hdfs:///copyOftest.c进行wordcount统计，结果输出到hdfs:///wordcount。有别于Hadoop的examples中读取本地文件系统中的文件，这次读取的是HDFS中的文件。

package com.fora;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class FileOperate {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
      init();/*初始化文件*/
      Configuration conf = new Configuration();
      Job job = new Job(conf, "word count");
      job.setJarByClass(FileOperate.class);
      job.setMapperClass(TokenizerMapper.class);
      job.setCombinerClass(IntSumReducer.class);
      job.setReducerClass(IntSumReducer.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(IntWritable.class);
            /* set the path of input and output*/
      FileInputFormat.addInputPath(job, new Path("hdfs:///copyOftest.c"));
      FileOutputFormat.setOutputPath(job, new Path("hdfs:///wordcount"));
      System.exit(job.waitForCompletion(true) ? 0 : 1);
   }
public static class TokenizerMapper
extends Mapper{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()){
   word.set(itr.nextToken());
   context.write(word, one);
}
}
}
public static class IntSumReducer
   extends Reducer {
  private IntWritable result = new IntWritable();
  public void reduce(Text key, Iterable values, Context context)
      throws IOException, InterruptedException{
int sum = 0;
for (IntWritable val : values){
   sum += val.get();
}
result.set(sum);
context.write(key, result);
  }
}
public static void init()throws IOException {
      /*copy local file to hdfs*/
      Configuration config = new Configuration();
      FileSystem hdfs = null;
      String  srcFile = "/test.c";
      String  dstFile = "hdfs:///copyOftest.c";
      System.out.print("copy success!\n");
hdfs = FileSystem.get(config);
      Path srcPath = new Path(srcFile);
      Path dstPath = new Path(dstFile);
      hdfs.copyFromLocalFile(srcPath, dstPath);
      String fileName = "hdfs:///copyOftest.c";
      Path path = new Path(fileName);
      FileStatus fileStatus =null;
      fileStatus = hdfs.getFileStatus(path);
      System.out.println(fileStatus.getBlockSize());
      FileSystem fs = FileSystem.get(config);
      DistributedFileSystem hdfs1 = (DistributedFileSystem) fs;
      DatanodeInfo[] dataNodeStats = hdfs1.getDataNodeStats();
      /*create a file on hdfs*/
      Path Outputpath = new Path("hdfs:///output/listOfDatanode");
      FSDataOutputStream outputStream = hdfs.create(Outputpath);
      String[] names = new String[dataNodeStats.length];
      for (int i = 0; i < dataNodeStats.length; i++) {
            names = dataNodeStats.getHostName();/*get the list of datanodes*/
                        System.out.println(names);
            /*write the list of datanodes to file on hdfs*/
            outputStream.write(names.getBytes(), 0, names.length());
      }
}
}

运行结果：
[iyunv@master bin]# hadoop jar HDFS.jar com.fora.FileOperate
copy success!
67108864
master
slave1
11/07/21 15:45:23 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
11/07/21 15:45:23 INFO input.FileInputFormat: Total input paths to process : 1
11/07/21 15:45:23 INFO mapred.JobClient: Running job: job_201107210917_0003
11/07/21 15:45:24 INFO mapred.JobClient:  map 0% reduce 0%
11/07/21 15:45:31 INFO mapred.JobClient:  map 100% reduce 0%
11/07/21 15:45:43 INFO mapred.JobClient:  map 100% reduce 100%
11/07/21 15:45:45 INFO mapred.JobClient: Job complete: job_201107210917_0003
11/07/21 15:45:45 INFO mapred.JobClient: Counters: 17
11/07/21 15:45:45 INFO mapred.JobClient: Job Counters
11/07/21 15:45:45 INFO mapred.JobClient:    Launched reduce tasks=1
11/07/21 15:45:45 INFO mapred.JobClient:    Rack-local map tasks=1
11/07/21 15:45:45 INFO mapred.JobClient:    Launched map tasks=1
11/07/21 15:45:45 INFO mapred.JobClient: FileSystemCounters
11/07/21 15:45:45 INFO mapred.JobClient:    FILE_BYTES_READ=228
11/07/21 15:45:45 INFO mapred.JobClient:    HDFS_BYTES_READ=126
11/07/21 15:45:45 INFO mapred.JobClient:    FILE_BYTES_WRITTEN=488
11/07/21 15:45:45 INFO mapred.JobClient:    HDFS_BYTES_WRITTEN=146
11/07/21 15:45:45 INFO mapred.JobClient: Map-Reduce Framework
11/07/21 15:45:45 INFO mapred.JobClient:    Reduce input groups=19
11/07/21 15:45:45 INFO mapred.JobClient:    Combine output records=19
11/07/21 15:45:45 INFO mapred.JobClient:    Map input records=8
11/07/21 15:45:45 INFO mapred.JobClient:    Reduce shuffle bytes=228
11/07/21 15:45:45 INFO mapred.JobClient:    Reduce output records=19
11/07/21 15:45:45 INFO mapred.JobClient:    Spilled Records=38
11/07/21 15:45:45 INFO mapred.JobClient:    Map output bytes=211
11/07/21 15:45:45 INFO mapred.JobClient:    Combine input records=22
11/07/21 15:45:45 INFO mapred.JobClient:    Map output records=22
11/07/21 15:45:45 INFO mapred.JobClient:    Reduce input records=19
[iyunv@master bin]# hadoop dfs  -ls /
Found 6 items
-rw-r--r-- 1 root supergroup       126 2011-07-21 15:45 /copyOftest.c
-rw-r--r-- 1 root supergroup       26 2011-07-21 15:16 /listOfDatanode
drwxr-xr-x - root supergroup       0 2011-07-21 15:45 /output
-rw-r--r-- 1 root supergroup    10400 2011-07-20 16:51 /test.txt
drwxr-xr-x - root supergroup       0 2011-07-20 16:09 /tmp
drwxr-xr-x - root supergroup       0 2011-07-21 15:45 /wordcount
[iyunv@master bin]# hadoop dfs -ls /wordcount
Found 2 items
drwxr-xr-x - root supergroup       0 2011-07-21 15:45 /wordcount/_logs
-rw-r--r-- 1 root supergroup       146 2011-07-21 15:45 /wordcount/part-r-00000
[iyunv@master bin]# hadoop dfs -cat /wordcount/part-r-00000
2011-07-21    1
File 1
Hadoop  1
System! 1
a    1
aimed 1
at    1
coping  1
file 3
from 1
from:fora    1
is    1
local 1
system  1
thank 1
the    1
this 2
to    1
you! 1
[iyunv@master bin]#

　　

账号		自动登录	找回密码
密码			立即注册

Centos6.5×64安装配置openmeetings3.0.3详

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

[经验分享] Hadoop：第二个程序操作HDFS

浏览过的版块

扫码加入运维网微信交流群