數(shù)據(jù)如下
[root@master conf]# cat ~/relations
Zhangsan Wangwu 01:01:01
Zhangsan Zhaoliu 00:11:21
Zhangsan Yuqi 00:19:01
Zhangsan Jingba 00:21:01
Zhangsan Wuxi 01:31:17
Wangwu Zhaoliu 00:51:01
Wangwu Zhaoliu 01:11:19
Wangwu Yuqi 00:00:21
Wangwu Yuqi 00:23:01
Yuqi Zhaoliu 01:18:01
Yuqi Wuxi 00:18:00
Jingba Wangwu 00:01:01
Jingba Wangwu 00:00:06
Jingba Wangwu 00:02:04
Jingba Wangwu 00:02:54
Wangwu Yuqi 01:00:13
Wangwu Yuqi 00:01:01
Wangwu Zhangsan 00:01:01
要求:兩個(gè)人統(tǒng)計(jì)相互通話總時(shí)長,并從高到低排列形成result1表,包括通話人和總時(shí)長兩個(gè)字段
解答
字段轉(zhuǎn)換
[root@master conf]# cat test_mapper.py
import sys
for line in sys.stdin:
line = line.strip()
fromstr, tostr, time = line.split('\t')
hours, minutes, secondes = time.split(':')
newtime = int(hours)*60*60 + int(minutes)*60 + int(secondes)
if cmp(fromstr, tostr) == -1:
#print(cmp(fromstr, tostr))
fromstr, tostr = tostr, fromstr
print ' '.join([fromstr + tostr, str(newtime)])
hive腳本
[root@master conf]# cat test.hive
CREATE TABLE relations_new (
fromtostr STRING,
duration INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ' ';
add FILE test_mapper.py;
INSERT OVERWRITE TABLE relations_new
SELECT
TRANSFORM (fromstr, tostr, duration)
USING 'python test_mapper.py'
AS (fromtostr, duration)
FROM relations;
SELECT fromtostr, SUM(duration)
FROM relations_new
GROUP BY fromtostr;
運(yùn)行hive腳本
[root@master conf]# hive -f test.hive