1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env python
#coding:utf-8
#找出cdn日志指定时间段,url访问次数最多的前10个;
import sys
import os
import string
import re
import MySQLdb
#加载hive的python相关库文件;
sys.path.append('/usr/local/hive_py')
from hive_service import ThriftHive
from hive_service.ttypes import HiveServerException
from thrift import Thrift
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol
# Hive database the query is meant to run against.
dbname = "default"

# Top-10 most requested URLs in the CDN log for a fixed ten-minute window.
# The literal is assembled from adjacent string pieces so that no line wrap can
# fall inside a quoted timestamp; the resulting value is one single-line query.
hsql = (
    "select request,count(request) as counts from cdnlog "
    "where time >= '[27/Oct/2014:10:40:00 +0800]' "
    "and time <= '[27/Oct/2014:10:49:59 +0800]' "
    "group by request order by counts desc limit 10"
)
def hiveExe(hsql,dbname):
    """Execute a HiveQL statement over the Hive Thrift service and fetch all rows.

    Args:
        hsql: The HiveQL statement to run.
        dbname: Name of the target database. Currently unused: the
            ``use <dbname>`` step is disabled, so the statement runs in
            whatever database the server session defaults to.

    Returns:
        Whatever ``client.fetchAll()`` returns for the statement, or ``None``
        if a ``Thrift.TException`` was raised (the error message is printed).
    """
    transport = None
    try:
        sock = TSocket.TSocket('172.16.41.151', 10000)
        transport = TTransport.TBufferedTransport(sock)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()
        # Register the contrib jar before querying (marked as required by the
        # original author). The path is on the remote Hive host, not on the
        # machine running this script.
        client.execute('add jar /usr/local/hive-0.8.1/lib/hive_contrib.jar')
        # Database selection is intentionally left disabled:
        # client.execute("use " + dbname)
        client.execute(hsql)
        return client.fetchAll()
    except Thrift.TException as tx:
        print('%s' % (tx.message))
        return None
    finally:
        # Previously close() sat after the return statement and never ran;
        # closing here releases the connection on success and on failure.
        if transport is not None:
            transport.close()
if __name__ == '__main__':
results=hiveExe(hsql,dbname)
num=len(results)
for i in range(num):
|