[Avro Part 3] Reading and Writing Avro Files with Hadoop MapReduce

  Avro was developed under the lead of Doug Cutting (the man is an absolute legend). From the outset it was built to round out data processing in the Hadoop ecosystem, serving as the serialization and deserialization format for data handled by Hadoop MapReduce, so integrating Avro with Hadoop MapReduce was only natural.
  This example is a simple Hadoop MapReduce job that reads Avro-formatted input, counts users by favorite color, and writes the result as Avro-formatted data to the output file. The main goal is to get a feel for the basic flow of handling Avro in Hadoop MapReduce and for the API that Avro provides.

1. Maven Dependencies

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>learn</groupId>
    <artifactId>learn.avro</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- Avro core -->
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro</artifactId>
            <version>1.7.7</version>
        </dependency>
        <!-- Avro RPC support -->
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro-ipc</artifactId>
            <version>1.7.7</version>
        </dependency>
        <!-- Avro utilities for Hadoop MapReduce to process Avro files -->
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro-mapred</artifactId>
            <version>1.7.7</version>
        </dependency>
        <!-- Hadoop MapReduce -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>1.2.1</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Generates Java sources from Avro schemas/protocols/IDL under src/main/avro -->
            <plugin>
                <groupId>org.apache.avro</groupId>
                <artifactId>avro-maven-plugin</artifactId>
                <version>1.7.7</version>
                <executions>
                    <execution>
                        <phase>generate-sources</phase>
                        <goals>
                            <goal>schema</goal>
                            <goal>protocol</goal>
                            <goal>idl-protocol</goal>
                        </goals>
                        <configuration>
                            <sourceDirectory>${project.basedir}/src/main/avro/</sourceDirectory>
                            <outputDirectory>${project.basedir}/src/main/java/</outputDirectory>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
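
  The avro-maven-plugin above compiles every schema under src/main/avro/ into Java classes; that is where the examples.avro.simple.User class used below comes from. The original post does not show the schema itself, so the following user.avsc is only a sketch: the favorite_color field is implied by the mapper's call to getFavoriteColor(), while the other fields are assumptions modeled on the Avro getting-started example.

{
  "namespace": "examples.avro.simple",
  "type": "record",
  "name": "User",
  "fields": [
    {"name": "name", "type": "string"},
    {"name": "favorite_number", "type": ["int", "null"]},
    {"name": "favorite_color", "type": ["string", "null"]}
  ]
}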

 

2. MapReduce Code

package examples.avro.mapreduce;

import examples.avro.simple.User;
import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyValueOutputFormat;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class MapReduceColorCount extends Configured implements Tool {

    // Mapper definition:
    // input key type is AvroKey<User>, input value type is NullWritable;
    // output key type is Text, output value type is IntWritable.
    public static class ColorCountMapper extends
            Mapper<AvroKey<User>, NullWritable, Text, IntWritable> {
        @Override
        public void map(AvroKey<User> key, NullWritable value, Context context)
                throws IOException, InterruptedException {
            CharSequence color = key.datum().getFavoriteColor();
            if (color == null) {
                color = "none";
            }
            context.write(new Text(color.toString()), new IntWritable(1));
        }
    }

    // Reducer definition:
    // input key type is Text, input value type is IntWritable (matching the mapper's output types);
    // output key type is AvroKey<CharSequence>, output value type is AvroValue<Integer>.
    public static class ColorCountReducer extends
            Reducer<Text, IntWritable, AvroKey<CharSequence>, AvroValue<Integer>> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(new AvroKey<CharSequence>(key.toString()), new AvroValue<Integer>(sum));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: MapReduceColorCount <input path> <output path>");
            return -1;
        }
        Job job = new Job(getConf());
        job.setJarByClass(MapReduceColorCount.class);
        job.setJobName("Color Count");

        // The input path; the input files are Avro container files.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // The output path; the output is an Avro file of key/value pairs, see AvroKeyValueOutputFormat.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // AvroKeyInputFormat: a MapReduce InputFormat that can handle Avro container files.
        job.setInputFormatClass(AvroKeyInputFormat.class);
        job.setMapperClass(ColorCountMapper.class);
        AvroJob.setInputKeySchema(job, User.getClassSchema());
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // AvroKeyValueOutputFormat: FileOutputFormat for writing Avro container files of key/value pairs.
        job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
        job.setReducerClass(ColorCountReducer.class);
        AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new MapReduceColorCount(), args);
        System.exit(res);
    }
}
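
  To try the job end to end, you first need an Avro container file of User records as input. Here is a minimal sketch that writes one with Avro's standard DataFileWriter API, assuming the generated User class from the user.avsc above (the class name CreateColorCountInput and the sample records are made up for illustration; the all-args constructor is what Avro's code generator produces for specific records):

package examples.avro.mapreduce;

import java.io.File;
import java.io.IOException;

import org.apache.avro.file.DataFileWriter;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.specific.SpecificDatumWriter;

import examples.avro.simple.User;

public class CreateColorCountInput {
    public static void main(String[] args) throws IOException {
        // Write a small Avro container file of User records for the job to consume.
        DatumWriter<User> datumWriter = new SpecificDatumWriter<User>(User.class);
        DataFileWriter<User> fileWriter = new DataFileWriter<User>(datumWriter);
        fileWriter.create(User.getClassSchema(), new File("users.avro"));
        fileWriter.append(new User("Alice", 7, "red"));
        fileWriter.append(new User("Bob", 3, "blue"));
        fileWriter.append(new User("Carol", 9, null)); // no favorite color, counted as "none"
        fileWriter.close();
    }
}

  With users.avro copied into HDFS, the job can then be launched the usual way, e.g. hadoop jar <your-jar> examples.avro.mapreduce.MapReduceColorCount <input path> <output path>.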

3. Javadoc for the Main Classes
  3.1 AvroKey

/** The wrapper of keys for jobs configured with {@link AvroJob} . */
  3.2 AvroValue

/** The wrapper of values for jobs configured with {@link AvroJob} . */
  3.3 AvroJob

/** Setters to configure jobs for Avro data. */
  3.4 AvroKeyInputFormat

/**
* A MapReduce InputFormat that can handle Avro container files.
*
* <p>Keys are AvroKey wrapper objects that contain the Avro data.  Since Avro
* container files store only records (not key/value pairs), the value from
* this InputFormat is a NullWritable.</p>
*/
  3.5 AvroKeyValueOutputFormat

/**
* FileOutputFormat for writing Avro container files of key/value pairs.
*
* <p>Since Avro container files can only contain records (not key/value pairs), this
* output format puts the key and value into an Avro generic record with two fields, named
* 'key' and 'value'.</p>
*
* <p>The keys and values given to this output format may be Avro objects wrapped in
* <code>AvroKey</code> or <code>AvroValue</code> objects.  The basic Writable types are
* also supported (e.g., IntWritable, Text); they will be converted to their corresponding
* Avro types.</p>
*
* @param <K> The type of key. If an Avro type, it must be wrapped in an <code>AvroKey</code>.
* @param <V> The type of value. If an Avro type, it must be wrapped in an <code>AvroValue</code>.
*/
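
  As the javadoc above notes, each record in the output file is a generic record with two fields named 'key' and 'value'. A quick way to verify the job's result is to read an output part file back with Avro's generic API; the following is a sketch (the class name DumpColorCounts is made up, and the part-file name, e.g. output/part-r-00000.avro, depends on the job run):

package examples.avro.mapreduce;

import java.io.File;
import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class DumpColorCounts {
    public static void main(String[] args) throws IOException {
        // args[0]: path to an Avro output part file produced by the job.
        DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(
                new File(args[0]), new GenericDatumReader<GenericRecord>());
        for (GenericRecord pair : reader) {
            // AvroKeyValueOutputFormat wraps each key/value pair in a record
            // with two fields named "key" and "value".
            System.out.println(pair.get("key") + "\t" + pair.get("value"));
        }
        reader.close();
    }
}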
  3.6 AvroJob.setInputKeySchema / setInputValueSchema

/**
 * Sets the job input key schema.
 *
 * @param job The job to configure.
 * @param schema The input key schema.
 */
public static void setInputKeySchema(Job job, Schema schema) {
    job.getConfiguration().set(CONF_INPUT_KEY_SCHEMA, schema.toString());
}

/**
 * Sets the job input value schema.
 *
 * @param job The job to configure.
 * @param schema The input value schema.
 */
public static void setInputValueSchema(Job job, Schema schema) {
    job.getConfiguration().set(CONF_INPUT_VALUE_SCHEMA, schema.toString());
}
  3.7 AvroJob.setMapOutputKeySchema / setMapOutputValueSchema

/**
 * Sets the map output key schema.
 *
 * @param job The job to configure.
 * @param schema The map output key schema.
 */
public static void setMapOutputKeySchema(Job job, Schema schema) {
    job.setMapOutputKeyClass(AvroKey.class);
    job.setGroupingComparatorClass(AvroKeyComparator.class);
    job.setSortComparatorClass(AvroKeyComparator.class);
    AvroSerialization.setKeyWriterSchema(job.getConfiguration(), schema);
    AvroSerialization.setKeyReaderSchema(job.getConfiguration(), schema);
    AvroSerialization.addToConfiguration(job.getConfiguration());
}

/**
 * Sets the map output value schema.
 *
 * @param job The job to configure.
 * @param schema The map output value schema.
 */
public static void setMapOutputValueSchema(Job job, Schema schema) {
    job.setMapOutputValueClass(AvroValue.class);
    AvroSerialization.setValueWriterSchema(job.getConfiguration(), schema);
    AvroSerialization.setValueReaderSchema(job.getConfiguration(), schema);
    AvroSerialization.addToConfiguration(job.getConfiguration());
}
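
  These two setters matter when the map output itself is Avro data; note how setMapOutputKeySchema also installs AvroKeyComparator for sorting and grouping. The ColorCount job above does not need them because its intermediate types are plain Text/IntWritable. As a hypothetical variant (not part of the original job), a mapper declared as Mapper<AvroKey<User>, NullWritable, AvroKey<User>, AvroValue<Integer>> would configure its driver like this instead of calling setMapOutputKeyClass/setMapOutputValueClass:

// Hypothetical excerpt from such a driver's run() method:
AvroJob.setMapOutputKeySchema(job, User.getClassSchema());
AvroJob.setMapOutputValueSchema(job, Schema.create(Schema.Type.INT));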
