spring hadoop之batch处理(二)

高峰之巅 · 发表于 2016-12-10 10:39:53

一、测试
/**
 * Command-line entry point: boots the Spring application context from the XML files on the
 * classpath and launches the configured batch job once.
 */
public class MrBatchApp {

    /** Class-level logger. */
    private static final Log log = LogFactory.getLog(MrBatchApp.class);

    /**
     * Loads every {@code *-context.xml} under {@code META-INF/spring} on the classpath, looks up
     * the {@link JobLauncher} and {@link Job} beans by type, and runs the job with empty
     * {@link JobParameters}.
     *
     * @param args command-line arguments (not used)
     * @throws JobParametersInvalidException propagated from launching the job
     * @throws JobExecutionAlreadyRunningException propagated from launching the job
     * @throws JobRestartException propagated from launching the job
     * @throws JobInstanceAlreadyCompleteException propagated from launching the job
     */
    public static void main(String[] args)
            throws JobParametersInvalidException,
                    JobExecutionAlreadyRunningException,
                    JobRestartException,
                    JobInstanceAlreadyCompleteException {
        System.out.println("TEST");

        // Load the matching XML configuration files.
        AbstractApplicationContext appContext =
                new ClassPathXmlApplicationContext("classpath:/META-INF/spring/*-context.xml");
        log.info("Batch Tweet Hashtag MR Job Running");

        // Outside a web environment the Spring IoC container has to be closed manually;
        // the shutdown hook takes care of that at the appropriate time.
        appContext.registerShutdownHook();

        // JobLauncher is a simple interface for controlling jobs. The interface itself makes
        // no guarantee about whether the job is executed synchronously or asynchronously.
        JobLauncher launcher = appContext.getBean(JobLauncher.class);

        // Look up the job and run it.
        launcher.run(appContext.getBean(Job.class), new JobParameters());
    }
}
二、xml配置文件
（1）、common 配置
<?xml version="1.0" encoding="UTF-8"?>
<!-- Common Spring Batch infrastructure shared by the job-specific context files. -->
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xmlns:p="http://www.springframework.org/schema/p"
       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd">

    <!-- Job repository -->
    <bean id="jobRepository" class="org.springframework.batch.core.repository.support.MapJobRepositoryFactoryBean"/>

    <!-- Transaction manager -->
    <bean id="transactionManager" class="org.springframework.batch.support.transaction.ResourcelessTransactionManager"/>

    <!-- Job launcher, wired to the job repository above -->
    <bean id="jobLauncher" class="org.springframework.batch.core.launch.support.SimpleJobLauncher" p:jobRepository-ref="jobRepository"/>

</beans>
(2)、特殊配置
<!-- Job-specific context: Hadoop configuration, HDFS setup script, and the two-step batch job. -->
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xmlns:batch="http://www.springframework.org/schema/batch"
       xmlns:hdp="http://www.springframework.org/schema/hadoop"
       xmlns:context="http://www.springframework.org/schema/context"
       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
           http://www.springframework.org/schema/batch http://www.springframework.org/schema/batch/spring-batch.xsd
           http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context.xsd
           http://www.springframework.org/schema/hadoop http://www.springframework.org/schema/hadoop/spring-hadoop.xsd">

    <!-- Import the common batch configuration -->
    <import resource="batch-common-context.xml"/>

    <!-- Properties: HDFS URI, input directory, output directory, local source file -->
    <context:property-placeholder location="hadoop.properties"/>

    <context:component-scan base-package="org.springframework.samples.hadoop.mapreduce" />

    <!-- Hadoop configuration: which HDFS to talk to -->
    <hdp:configuration>
        fs.defaultFS=${hd.fs}
    </hdp:configuration>

    <!-- Setup script, run at startup; it receives localSourceFile, inputDir and outputDir -->
    <hdp:script id="setupScript" location="file-prep.groovy" run-at-startup="true">
        <hdp:property name="localSourceFile" value="${localSourceFile}"/>
        <hdp:property name="inputDir" value="${tweets.input.path}"/>
        <hdp:property name="outputDir" value="${tweets.output.path}"/>
    </hdp:script>

    <!-- Register the "step" scope used by the step-scoped beans below -->
    <bean class="org.springframework.batch.core.scope.StepScope">
        <property name="proxyTargetClass" value="true"/>
    </bean>

    <!-- Job definition: run the MapReduce step, then the result-printing step -->
    <job id="job" xmlns="http://www.springframework.org/schema/batch">
        <step id="hashtagcount" next="result-step">
            <tasklet ref="hashtagcount-tasklet" />
        </step>
        <step id="result-step">
            <tasklet ref="results"/>
        </step>
    </job>

    <!-- Tasklet that runs the Hadoop job declared below -->
    <hdp:job-tasklet id="hashtagcount-tasklet" job-ref="hashtagcountJob" scope="step"/>

    <!-- The MapReduce job: input/output paths plus mapper and reducer classes -->
    <hdp:job id="hashtagcountJob"
        input-path="${tweets.input.path}"
        output-path="${tweets.output.path}"
        mapper="org.springframework.samples.hadoop.mapreduce.HashtagCount$TokenizerMapper"
        reducer="org.springframework.samples.hadoop.mapreduce.HashtagCount$LongSumReducer"
        scope="step" />

    <!-- Tasklet that outputs the statistics produced by the job -->
    <hdp:script-tasklet id="results" scope="step">
        <hdp:script location="classpath:results.groovy">
            <hdp:property name="outputDir" value="${tweets.output.path}"/>
        </hdp:script>
    </hdp:script-tasklet>

</beans>
三、groovy脚本
// If the directory that should hold the source file to analyse is not there yet,
// create it, copy the local source file into it and change the directory's permissions.
def inputPresent = fsh.test(inputDir)
if (!inputPresent) {
    fsh.mkdir(inputDir)
    fsh.copyFromLocal(localSourceFile, inputDir)
    fsh.chmod(700, inputDir)
}

// If the statistics output directory already exists, remove it.
def outputPresent = fsh.test(outputDir)
if (outputPresent) {
    fsh.rmr(outputDir)
}
-----------------------------------------------------------------------
// Print the contents of the analysis/statistics result.
println "RESULTS from " + outputDir

// Declared with `def` so the handle stays local to this script instead of being
// stored as an undeclared variable in the script binding.
def localResults = new File('results.txt')

// Remove any copy left behind by a previous run before fetching a fresh one.
if (localResults.exists()) {
    localResults.delete()
}

// Fetch the reducer output from outputDir into the local results file.
fsh.get(outputDir + '/part-r-*', 'results.txt')

String fileContents = localResults.text
println fileContents
通过以上配置，即可完全基于 XML 完成 MapReduce 的 batch 处理。

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] spring hadoop之batch处理(二)

浏览过的版块

扫码加入运维网微信交流群