基于hive的hadoop日志分析

zhwz · 发表于 2016-12-10 08:05:36

　　1.日志格式
日期 时间 级别 相关类 信息（各字段之间以空格分隔）
2011-08-01 08:39:08,020 INFO org.apache.hadoop.ipc.Server:IPC server Responder......
2.存储结构
日期、时间、级别、相关类各占一列，信息占3列
create table if not exists loginfo11(rdate string,time array<string>,type string,relateclass string,information1 string,information2 string,information3 string)
row format delimited fields terminated by ' ' collection items terminated by ',' map keys terminated by ':';
3.getConnect.java
新建工程hiveAction，导入 hadoop的jar包，导入mysql的驱动包
getConnect类负责建立与Hive和MYSQL的连接，由于每个连接的开销比较大，所以此类的设计采用的是单例模式：

package com.cstore.transToHive;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
/**
 * Holder for the two JDBC connections used by the log-analysis job: one to
 * Hive and one to MySQL.
 *
 * <p>Each connection is created lazily on first request and then cached in a
 * static field, because opening a connection is expensive. No synchronization
 * is performed, so the class is intended for single-threaded use.
 */
public class getConnect {
    private static Connection conn = null;
    private static Connection conntomysql = null;

    private static final String DBURL = "jdbc:hive://192.168.1.13:50031/default";
    private static final String DBUSERNAME = "hive";
    private static final String DBPASSWORD = "123456";

    private static final String MYSQL_URL =
            "jdbc:mysql://192.168.1.12:3306/hadoop?useUnicode=true&characterEncoding=GBK";
    private static final String MYSQL_USERNAME = "hive";
    private static final String MYSQL_PASSWORD = "123456";

    /** Not instantiable: all members are static. */
    private getConnect() {
    }

    /**
     * Returns the cached Hive connection, opening it on first use.
     *
     * @return the shared Hive connection
     * @throws SQLException if the Hive JDBC driver class cannot be loaded or
     *         the connection cannot be established
     */
    public static Connection getHiveConn() throws SQLException {
        if (conn == null) {
            loadDriver("org.apache.hadoop.hive.jdbc.HiveDriver");
            conn = DriverManager.getConnection(DBURL, DBUSERNAME, DBPASSWORD);
        }
        return conn;
    }

    /**
     * Returns the cached MySQL connection, opening it on first use.
     *
     * @return the shared MySQL connection
     * @throws SQLException if the MySQL JDBC driver class cannot be loaded or
     *         the connection cannot be established
     */
    public static Connection getMysqlConn() throws SQLException {
        if (conntomysql == null) {
            loadDriver("com.mysql.jdbc.Driver");
            conntomysql = DriverManager.getConnection(MYSQL_URL, MYSQL_USERNAME, MYSQL_PASSWORD);
        }
        return conntomysql;
    }

    /**
     * Closes the Hive connection once all work for this session is done.
     * Does nothing if no connection was opened.
     *
     * @throws SQLException if closing the connection fails
     */
    public static void closeHive() throws SQLException {
        if (conn != null) {
            try {
                conn.close();
            } finally {
                // Drop the cached reference so a later getHiveConn() opens a
                // fresh connection instead of handing out a closed one.
                conn = null;
            }
        }
    }

    /**
     * Closes the MySQL connection once all work for this session is done.
     * Does nothing if no connection was opened.
     *
     * @throws SQLException if closing the connection fails
     */
    public static void closemysql() throws SQLException {
        if (conntomysql != null) {
            try {
                conntomysql.close();
            } finally {
                // Same reasoning as closeHive(): never cache a closed connection.
                conntomysql = null;
            }
        }
    }

    /**
     * Loads a JDBC driver class so that it registers itself with DriverManager.
     * A missing driver is reported as an SQLException (with the original
     * ClassNotFoundException as its cause) rather than terminating the JVM,
     * so the caller decides how to react.
     */
    private static void loadDriver(String driverClass) throws SQLException {
        try {
            Class.forName(driverClass);
        } catch (ClassNotFoundException e) {
            throw new SQLException("JDBC driver class not found on classpath: " + driverClass, e);
        }
    }
}
　　 4.HiveUtil.java

package com.cstore.transToHive;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
/**
 * Static helpers that run HiveQL through the shared Hive connection and copy
 * query results into the MySQL table {@code hadooplog}.
 */
public class HiveUtil {

    /**
     * Creates a table in Hive.
     *
     * @param hiveql the HiveQL {@code create table} statement to run
     * @throws SQLException if the connection cannot be obtained or the statement fails
     */
    public static void createTable(String hiveql) throws SQLException {
        executeAndClose(hiveql);
    }

    /**
     * Runs a HiveQL query and returns its result set.
     *
     * <p>The statement that produced the result set is intentionally left open
     * on success, because the returned ResultSet is still being read by the
     * caller.
     *
     * @param hiveql the HiveQL query to run
     * @return the query result
     * @throws SQLException if the connection cannot be obtained or the query fails
     */
    public static ResultSet queryHive(String hiveql) throws SQLException {
        Connection con = getConnect.getHiveConn();
        Statement stmt = con.createStatement();
        try {
            return stmt.executeQuery(hiveql);
        } catch (SQLException e) {
            // No result set will be handed out, so release the statement now.
            stmt.close();
            throw e;
        }
    }

    /**
     * Loads data into a Hive table.
     *
     * @param hiveql the HiveQL {@code load data} statement to run
     * @throws SQLException if the connection cannot be obtained or the statement fails
     */
    public static void loadDate(String hiveql) throws SQLException {
        executeAndClose(hiveql);
    }

    /**
     * Copies every row of a Hive result set into the MySQL table {@code hadooplog}.
     *
     * <p>Columns 1-4 are read as date, time, level and related class; columns
     * 5-7 are concatenated into a single information value (a NULL column
     * contributes nothing). Each row is echoed to standard output and inserted
     * with a parameterized statement, so quotes inside log text cannot break
     * the insert.
     *
     * @param Hiveres the Hive result set to transfer; it must expose at least seven columns
     * @throws SQLException if reading from Hive or writing to MySQL fails
     */
    public static void hiveTomysql(ResultSet Hiveres) throws SQLException {
        Connection con = getConnect.getMysqlConn();
        PreparedStatement insert = con.prepareStatement("insert into hadooplog values(0,?,?,?,?,?)");
        try {
            while (Hiveres.next()) {
                String rdate = Hiveres.getString(1);
                String time = Hiveres.getString(2);
                String type = Hiveres.getString(3);
                String relateclass = Hiveres.getString(4);
                // Could also be done on the Hive side with a UDAF.
                String information = nullToEmpty(Hiveres.getString(5))
                        + nullToEmpty(Hiveres.getString(6))
                        + nullToEmpty(Hiveres.getString(7));
                System.out.println(rdate + " " + time + " " + type + " "
                        + relateclass + " " + information + " ");
                insert.setString(1, rdate);
                insert.setString(2, time);
                insert.setString(3, type);
                insert.setString(4, relateclass);
                insert.setString(5, information);
                insert.executeUpdate();
            }
        } finally {
            insert.close();
        }
    }

    /**
     * Runs a statement on the Hive connection whose result is not needed and
     * always releases the statement afterwards.
     */
    private static void executeAndClose(String hiveql) throws SQLException {
        Connection con = getConnect.getHiveConn();
        Statement stmt = con.createStatement();
        try {
            // NOTE(review): executeQuery is kept for DDL/LOAD statements exactly as
            // the original code did; presumably the legacy Hive JDBC driver does not
            // support executeUpdate - confirm before changing.
            stmt.executeQuery(hiveql);
        } finally {
            stmt.close();
        }
    }

    /** Maps a NULL column value to the empty string so it does not become the text "null". */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }
}

　　 5.exeHiveQL.java
　　新建exeHiveQL类，其中包含main方法，它是驱动类，运行时需要两个参数：日志级别和日期
程序执行时：首先在Hive数据仓库中建立表，然后加载hadoop日志，过滤出有用的日志信息后转存到mysql数据库中

package com.cstore.transToHive;
import java.sql.ResultSet;
import java.sql.SQLException;
/**
 * Driver program: creates the Hive table, loads the Hadoop log files into it,
 * selects the rows matching the requested log level and date, and copies them
 * into MySQL.
 *
 * <p>Usage: {@code exeHiveQL <level> <date>}, for example
 * {@code exeHiveQL ERROR 2011-07-29}.
 */
public class exeHiveQL {

    /**
     * Program entry point.
     *
     * @param args {@code args[0]} is the log level to filter on (letters only),
     *             {@code args[1]} is the log date in {@code yyyy-MM-dd} form
     * @throws SQLException if any Hive or MySQL operation fails
     */
    public static void main(String[] args) throws SQLException {
        if (args.length < 2) {
            System.out.print("请输入你要查询的条件：日志级别 日期");
            System.exit(1);
        }
        String type = args[0];
        String date = args[1];
        // Both values are spliced into the HiveQL text below, so accept only
        // the exact shapes expected and reject anything that could contain quotes.
        if (!type.matches("[A-Za-z]+") || !date.matches("\\d{4}-\\d{2}-\\d{2}")) {
            System.out.print("参数格式错误：日志级别应为字母（如 ERROR），日期应为 yyyy-MM-dd（如 2011-07-29）");
            System.exit(1);
        }
        try {
            // Create the table in Hive.
            HiveUtil.createTable(
                    "create table if not exists loginfo11 ( rdate String,time ARRAY<string>,type STRING,"
                    + "relateclass STRING,information1 STRING,information2 STRING,information3 STRING)"
                    + " ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '"
                    + " COLLECTION ITEMS TERMINATED BY ','"
                    + " MAP KEYS TERMINATED BY ':'");
            // Load the Hadoop log files; '*' selects every file in the directory.
            HiveUtil.loadDate("load data local inpath '/usr/local/hadoop2/logs/*' overwrite into table loginfo11");
            // Select the rows of interest, filtered by the requested level and date.
            ResultSet res1 = HiveUtil.queryHive(
                    "select rdate,time[0],type,relateclass,information1,information2,information3"
                    + " from loginfo11 where type='" + type + "' and rdate='" + date + "' ");
            // Transform the selected rows and store them in MySQL.
            HiveUtil.hiveTomysql(res1);
        } finally {
            // Release both connections even when one of the steps above failed.
            try {
                getConnect.closeHive();
            } finally {
                getConnect.closemysql();
            }
        }
    }
}
　　 6.测试
　　运行程序前需要在装有Hive的机器上启动HiveServer服务并指定一个端口监听
hive --service hiveserver 50031
运行exeHiveQL.java，输入参数作为查询的条件查找用户所关注的信息，如查询2011年7月29日所有的ERROR信息，那么参数就是 ERROR 2011-07-29，运行后可以看到Hiveserver的控制台上输出了运行时的信息，程序执行完毕后去mysql的控制台，查看hadooplog表中的结果

7.分析总结
本次程序的实现其实mapreduce也能够做，因为Hive的底层调用的就是mapreduce，所以hive的效率没有mapreduce的高
mapreduce：效率高，编程复杂
hive：效率低，编程简单
（上文中的效率高或者效率低仅仅是mapreduce和hive之间相对来说的）

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] 基于hive的hadoop日志分析

扫码加入运维网微信交流群