设为首页 收藏本站
查看: 658|回复: 0

[经验分享] 【大数据】Hadoop 成长之路 (二) 利用Hadoop分析气象数据集

[复制链接]

尚未签到

发表于 2018-10-29 12:06:22 | 显示全部楼层 |阅读模式
1.下载气象数据集  
    wget -r -c <气象数据集下载地址>   # 注：原下载链接在转载时丢失；NCDC 气象数据通常可从 ftp://ftp.ncdc.noaa.gov/pub/data/noaa/2008/ 获取（请自行核实）
  
    查看数据集
  
    [root@hadoop-master 2008]# ls
  
    010030-99999-2008.gz  010231-99999-2008.gz  010460-99999-2008.gz  010570-99999-2008.gz  010881-99999-2008.gz
  
    010070-99999-2008.gz  010260-99999-2008.gz  010490-99999-2008.gz  010750-99999-2008.gz  010883-99999-2008.gz
  
    010150-99999-2008.gz  010330-99999-2008.gz  010520-99999-2008.gz  010780-99999-2008.gz  010890-99999-2008.gz
  
    010230-99999-2008.gz  010450-99999-2008.gz  010550-99999-2008.gz  010830-99999-2008.gz
  

  
2.将数据解压并导入到example文件中
  
    [root@hadoop-master 2008]# zcat *.gz > example
  
    查看文件是否正确
  
    [root@hadoop-master 2008]# tail -10 example
  
    0101010980999992008031013004+70367+031100FM-12+001599999V0201801N006019999999N9999999N1-00081-00291099591ADDMA1999999099411MD1710101+9999REMSYN060AAXX  10131 01098 46/// /1806 11008 21029 39941 49959 57010;
  
    0101010980999992008031014004+70367+031100FM-12+001599999V0201901N006019999999N9999999N1-00071-00241099601ADDMA1999999099411MD1710051+9999REMSYN060AAXX  10141 01098 46/// /1906 11007 21024 39941 49960 57005;
  
    0171010980999992008031015004+70367+031100FM-12+001599999V0202201N004010042019N0060001N1-00151-00261099611ADDAY171031AY221031GF107991061071004501021999MA1999999099431MD1510021+9999MW1221REMSYN082AAXX  10151 01098 41456 72204 11015 21026 39943 49961 55002 72272 8672/ 333 4////;
  
    0101010980999992008031016004+70367+031100FM-12+001599999V0202101N005019999999N9999999N1-00121-00211099581ADDMA1999999099401MD1010011+9999REMSYN060AAXX  10161 01098 46/// /2105 11012 21021 39940 49958 50001;
  
    0101010980999992008031017004+70367+031100FM-12+001599999V0202201N004019999999N9999999N1-00131-00231099591ADDMA1999999099411MD1410001+9999REMSYN060AAXX  10171 01098 46/// /2204 11013 21023 39941 49959 54000;
  
    0213010980999992008031018004+70367+031100FM-12+001599999V0201901N004010042019N0150001N1-00061-00151099601ADDAA112000021AY171061AY221061GF107991051071004501021999KA1120M-00061MA1999999099421MD1510011+9999MW1701REMSYN100AAXX  10181 01098 11465 71904 11006 21015 39942 49960 55001 69912 77072 8572/ 333 11006 4//// 91107;
  
    0101010980999992008031019004+70367+031100FM-12+001599999V0201901N006019999999N9999999N1+00001-00101099591ADDMA1999999099411MD1210011+9999REMSYN060AAXX  10191 01098 46/// /1906 10000 21010 39941 49959 52001;
  
    0101010980999992008031020004+70367+031100FM-12+001599999V0201801N006019999999N9999999N1+00041-00091099621ADDMA1999999099441MD1210031+9999REMSYN060AAXX  10201 01098 46/// /1806 10004 21009 39944 49962 52003;
  
    0171010980999992008031021004+70367+031100FM-12+001599999V0201901N005010042019N0300001N1+00071-00061099621ADDAY171031AY221031GF107991071071004501999999MA1999999099441MD1210021+9999MW1021REMSYN082AAXX  10211 01098 41480 71905 10007 21006 39944 49962 52002 70272 877// 333 4////;
  
    0101010980999992008031022004+70367+031100FM-12+001599999V0201901N005019999999N9999999N1+00091-0004
  

  
3.将数据集导入到hadoop的in目录的test文件中
  
    /root/hadoop-1.1.2/bin/hadoop fs -put ./example ./in/test
  

  
4.编写MapReduce程序
  
    cd /root/hadoop-1.1.2/myclass
  
    [root@hadoop-master myclass]# cat MaxTemperature.java
  
    // cc MaxTemperature Application to find the maximum temperature in the weather dataset
  
    // vv MaxTemperature
  
    import org.apache.hadoop.fs.Path;
  
    import org.apache.hadoop.io.IntWritable;
  
    import org.apache.hadoop.io.Text;
  
    import org.apache.hadoop.mapreduce.Job;
  
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  

  
    public class MaxTemperature {
  

  
      public static void main(String[] args) throws Exception {
  
        if (args.length != 2) {
  
          System.err.println("Usage: MaxTemperature  ");
  
          System.exit(-1);
  
        }
  

  
        Job job = new Job();
  
        job.setJarByClass(MaxTemperature.class);
  
        job.setJobName("Max temperature");
  

  
        FileInputFormat.addInputPath(job, new Path(args[0]));
  
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
  

  
        job.setMapperClass(MaxTemperatureMapper.class);
  
        job.setReducerClass(MaxTemperatureReducer.class);
  

  
        job.setOutputKeyClass(Text.class);
  
        job.setOutputValueClass(IntWritable.class);
  

  
        System.exit(job.waitForCompletion(true) ? 0 : 1);
  
      }
  
    }
  
    // ^^ MaxTemperature
  

  
    [root@hadoop-master myclass]# cat MaxTemperatureMapper.java
  
    // cc MaxTemperatureMapper Mapper for maximum temperature example
  
    // vv MaxTemperatureMapper
  
    import java.io.IOException;
  

  
    import org.apache.hadoop.io.IntWritable;
  
    import org.apache.hadoop.io.LongWritable;
  
    import org.apache.hadoop.io.Text;
  
    import org.apache.hadoop.mapreduce.Mapper;
  

  
    public class MaxTemperatureMapper
  
      extends Mapper {
  

  
      private static final int MISSING = 9999;
  

  
      @Override
  
      public void map(LongWritable key, Text value, Context context)
  
          throws IOException, InterruptedException {
  

  
        String line = value.toString();
  
        String year = line.substring(15, 19);
  
        int airTemperature;
  
        if (line.charAt(87) == '+') { // parseInt doesn't like leading plus signs
  
          airTemperature = Integer.parseInt(line.substring(88, 92));
  
        } else {
  
          airTemperature = Integer.parseInt(line.substring(87, 92));
  
        }
  
        String quality = line.substring(92, 93);
  
        if (airTemperature != MISSING && quality.matches("[01459]")) {
  
          context.write(new Text(year), new IntWritable(airTemperature));
  
        }
  
      }
  
    }
  
    // ^^ MaxTemperatureMapper
  

  
    [root@hadoop-master myclass]# cat MaxTemperatureMapper.java   （注：原文此处将 Mapper 代码重复展示了一次，内容与上文相同）
  
    // cc MaxTemperatureMapper Mapper for maximum temperature example
  
    // vv MaxTemperatureMapper
  
    import java.io.IOException;
  

  
    import org.apache.hadoop.io.IntWritable;
  
    import org.apache.hadoop.io.LongWritable;
  
    import org.apache.hadoop.io.Text;
  
    import org.apache.hadoop.mapreduce.Mapper;
  

  
    public class MaxTemperatureMapper
  
      extends Mapper {
  

  
      private static final int MISSING = 9999;
  

  
      @Override
  
      public void map(LongWritable key, Text value, Context context)
  
          throws IOException, InterruptedException {
  

  
        String line = value.toString();
  
        String year = line.substring(15, 19);
  
        int airTemperature;
  
        if (line.charAt(87) == '+') { // parseInt doesn't like leading plus signs
  
          airTemperature = Integer.parseInt(line.substring(88, 92));
  
        } else {
  
          airTemperature = Integer.parseInt(line.substring(87, 92));
  
        }
  
        String quality = line.substring(92, 93);
  
        if (airTemperature != MISSING && quality.matches("[01459]")) {
  
          context.write(new Text(year), new IntWritable(airTemperature));
  
        }
  
      }
  
    }
  
    // ^^ MaxTemperatureMapper
  

  
    [root@hadoop-master myclass]# cat MaxTemperatureReducer.java
  
    // cc MaxTemperatureReducer Reducer for maximum temperature example
  
    // vv MaxTemperatureReducer
  
    import java.io.IOException;
  

  
    import org.apache.hadoop.io.IntWritable;
  
    import org.apache.hadoop.io.Text;
  
    import org.apache.hadoop.mapreduce.Reducer;
  

  
    public class MaxTemperatureReducer
  
      extends Reducer {
  

  
      @Override
  
      public void reduce(Text key, Iterable values,
  
          Context context)
  
          throws IOException, InterruptedException {
  

  
        int maxValue = Integer.MIN_VALUE;
  
        for (IntWritable value : values) {
  
          maxValue = Math.max(maxValue, value.get());
  
        }
  
        context.write(key, new IntWritable(maxValue));
  
      }
  
    }
  
    // ^^ MaxTemperatureReducer
  

  
5.编译程序
  
    javac -classpath ../hadoop-core-1.1.2.jar *.java
  
    查看
  
    [root@hadoop-master myclass]# ls
  
    MaxTemperature.class  MaxTemperatureMapper.class  MaxTemperatureReducer.class
  
    MaxTemperature.java   MaxTemperatureMapper.java   MaxTemperatureReducer.java
  

  
6.制作jar包
  
    [root@hadoop-master myclass]# jar cvf ../MaxTemperature.jar *.class
  
    added manifest
  
    adding: MaxTemperature.class(in = 1413) (out= 799)(deflated 43%)
  
    adding: MaxTemperatureMapper.class(in = 1876) (out= 805)(deflated 57%)
  
    adding: MaxTemperatureReducer.class(in = 1687) (out= 717)(deflated 57%)
  
    删除类文件:rm -rf *.class
  

  
7.运行程序
  
    分析上述导入文件至in/test的数据集,并将分析结果导出到./out_result中
  
    [root@hadoop-master hadoop-1.1.2]# ./bin/hadoop jar MaxTemperature.jar MaxTemperature ./in/test ./out_result
  
    16/07/13 23:07:54 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
  
    16/07/13 23:07:55 INFO input.FileInputFormat: Total input paths to process : 1
  
    16/07/13 23:07:55 INFO util.NativeCodeLoader: Loaded the native-hadoop library
  
    16/07/13 23:07:55 WARN snappy.LoadSnappy: Snappy native library not loaded
  
    16/07/13 23:07:59 INFO mapred.JobClient: Running job: job_201607131558_0001
  
    16/07/13 23:08:00 INFO mapred.JobClient:  map 0% reduce 0%
  
    16/07/13 23:08:23 INFO mapred.JobClient:  map 100% reduce 0%
  
    16/07/13 23:08:38 INFO mapred.JobClient:  map 100% reduce 100%
  
    16/07/13 23:08:40 INFO mapred.JobClient: Job complete: job_201607131558_0001
  
    16/07/13 23:08:40 INFO mapred.JobClient: Counters: 29
  
    16/07/13 23:08:40 INFO mapred.JobClient:   Map-Reduce Framework
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Spilled Records=300506
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Map output materialized bytes=1652789
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Reduce input records=150253
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=3868651520
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Map input records=150656
  
    16/07/13 23:08:40 INFO mapred.JobClient:     SPLIT_RAW_BYTES=108
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Map output bytes=1352277
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Reduce shuffle bytes=1652789
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Physical memory (bytes) snapshot=295931904
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Reduce input groups=1
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Combine output records=0
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Reduce output records=1
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Map output records=150253
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Combine input records=0
  
    16/07/13 23:08:40 INFO mapred.JobClient:     CPU time spent (ms)=12220
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Total committed heap usage (bytes)=177016832
  
    16/07/13 23:08:40 INFO mapred.JobClient:   File Input Format Counters
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Bytes Read=35197493
  
    16/07/13 23:08:40 INFO mapred.JobClient:   FileSystemCounters
  
    16/07/13 23:08:40 INFO mapred.JobClient:     HDFS_BYTES_READ=35197601
  
    16/07/13 23:08:40 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=3409028
  
    16/07/13 23:08:40 INFO mapred.JobClient:     FILE_BYTES_READ=1652789
  
    16/07/13 23:08:40 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=9
  
    16/07/13 23:08:40 INFO mapred.JobClient:   Job Counters
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Launched map tasks=1
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Launched reduce tasks=1
  
    16/07/13 23:08:40 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=12976
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
  
    16/07/13 23:08:40 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=18769
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Data-local map tasks=1
  
    16/07/13 23:08:40 INFO mapred.JobClient:   File Output Format Counters
  
    16/07/13 23:08:40 INFO mapred.JobClient:     Bytes Written=9
  

  

  
8. 查看结果
  
    [root@hadoop-master hadoop-1.1.2]# ./bin/hadoop fs -ls ./out_result
  
    Found 3 items
  
    -rw-r--r--   3 root supergroup          0 2016-07-13 23:08 /user/root/out_result/_SUCCESS
  
    drwxr-xr-x   - root supergroup          0 2016-07-13 23:08 /user/root/out_result/_logs
  
    -rw-r--r--   3 root supergroup          9 2016-07-13 23:08 /user/root/out_result/part-r-00000
  
    log文件为日志,part文件为结果
  
    查看part文件内容:
  
    [root@hadoop-master hadoop-1.1.2]# ./bin/hadoop fs -cat ./out_result/part-r-00000
  
2008	290
（注：原输出中年份与温度之间为制表符，转载时被合并为 "2008290"。含义：2008 年全年最高气温为 290，即 29.0 ℃ —— NCDC 数据中温度以 0.1 ℃ 为单位。）



运维网声明 1、欢迎大家加入本站运维交流群:群②:261659950 群⑤:202807635 群⑦870801961 群⑧679858003
2、本站所有主题由该帖子作者发表,该帖子作者与运维网享有帖子相关版权
3、所有作品的著作权均归原作者享有,请您和我们一样尊重他人的著作权等合法权益。如果您对作品感到满意,请购买正版
4、禁止制作、复制、发布和传播具有反动、淫秽、色情、暴力、凶杀等内容的信息,一经发现立即删除。若您因此触犯法律,一切后果自负,我们对此不承担任何责任
5、所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其内容的准确性、可靠性、正当性、安全性、合法性等负责,亦不承担任何法律责任
6、所有作品仅供您个人学习、研究或欣赏,不得用于商业或者其他用途,否则,一切后果均由您自己承担,我们对此不承担任何法律责任
7、如涉及侵犯版权等问题,请您及时通知我们,我们将立即采取措施予以解决
8、联系人Email:admin@iyunv.com 网址:www.yunweiku.com

所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其承担任何法律责任,如涉及侵犯版权等问题,请您及时通知我们,我们将立即处理,联系人Email:kefu@iyunv.com,QQ:1061981298 本贴地址:https://www.yunweiku.com/thread-627987-1-1.html 上篇帖子: Hadoop之NameNode元数据相关文件目录解析 下篇帖子: Hadoop2.6分布式集群安装配置
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

扫码加入运维网微信交流群X

扫码加入运维网微信交流群

扫描二维码加入运维网微信交流群,最新一手资源尽在官方微信交流群!快快加入我们吧...

扫描微信二维码查看详情

客服E-mail:kefu@iyunv.com 客服QQ:1061981298


QQ群⑦:运维网交流群⑦ QQ群⑧:运维网交流群⑧ k8s群:运维网kubernetes交流群


提醒:禁止发布任何违反国家法律、法规的言论与图片等内容;本站内容均来自个人观点与网络等信息,非本站认同之观点.


本站大部分资源是网友从网上搜集分享而来,其版权均归原作者及其网站所有,我们尊重他人的合法权益,如有内容侵犯您的合法权益,请及时与我们联系进行核实删除!



合作伙伴: 青云cloud

快速回复 返回顶部 返回列表