預備知識:
了解hive窗口函數(shù):LAG 和 LEAD
數(shù)據(jù)準備:
cookie1,2015-04-10 10:00:02,url2
cookie1,2015-04-10 10:00:00,url1
cookie1,2015-04-10 10:03:04,1url3
cookie1,2015-04-10 10:50:05,url6
cookie1,2015-04-10 11:00:00,url7
cookie1,2015-04-10 10:10:00,url4
cookie1,2015-04-10 10:50:01,url5
cookie2,2015-04-10 10:00:02,url22
cookie2,2015-04-10 10:00:00,url11
cookie2,2015-04-10 10:03:04,1url33
cookie2,2015-04-10 10:50:05,url66
cookie2,2015-04-10 11:00:00,url77
cookie2,2015-04-10 10:10:00,url44
cookie2,2015-04-10 10:50:01,url55
CREATE EXTERNAL TABLE lxw1234 (
cookieid string,
createtime string,? --頁面訪問時間
url STRING? ? ? --被訪問頁面
) ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
stored as textfile location '/tmp/lxw11/';
hive> select * from lxw1234;
OK
cookie1 2015-04-10 10:00:02? ? url2
cookie1 2015-04-10 10:00:00? ? url1
cookie1 2015-04-10 10:03:04? ? 1url3
cookie1 2015-04-10 10:50:05? ? url6
cookie1 2015-04-10 11:00:00? ? url7
cookie1 2015-04-10 10:10:00? ? url4
cookie1 2015-04-10 10:50:01? ? url5
cookie2 2015-04-10 10:00:02? ? url22
cookie2 2015-04-10 10:00:00? ? url11
cookie2 2015-04-10 10:03:04? ? 1url33
cookie2 2015-04-10 10:50:05? ? url66
cookie2 2015-04-10 11:00:00? ? url77
cookie2 2015-04-10 10:10:00? ? url44
cookie2 2015-04-10 10:50:01? ? url55
LAG
LAG(col,n,DEFAULT) 用于統(tǒng)計窗口內(nèi)往上第n行值
第一個參數(shù)為列名沾谜,第二個參數(shù)為往上第n行(可選,默認為1),第三個參數(shù)為默認值(當往上第n行為NULL時候,取默認值,如不指定椒涯,則為NULL)
SELECT cookieid,
createtime,
url,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
LAG(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS last_1_time,
LAG(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS last_2_time
FROM lxw1234;
cookieid createtime? ? ? ? ? ? url? ? rn? ? ? last_1_time? ? ? ? ? ? last_2_time
-------------------------------------------------------------------------------------------
cookie1 2015-04-10 10:00:00? ? url1? ? 1? ? ? 1970-01-01 00:00:00? ? NULL
cookie1 2015-04-10 10:00:02? ? url2? ? 2? ? ? 2015-04-10 10:00:00? ? NULL
cookie1 2015-04-10 10:03:04? ? 1url3? 3? ? ? 2015-04-10 10:00:02? ? 2015-04-10 10:00:00
cookie1 2015-04-10 10:10:00? ? url4? ? 4? ? ? 2015-04-10 10:03:04? ? 2015-04-10 10:00:02
cookie1 2015-04-10 10:50:01? ? url5? ? 5? ? ? 2015-04-10 10:10:00? ? 2015-04-10 10:03:04
cookie1 2015-04-10 10:50:05? ? url6? ? 6? ? ? 2015-04-10 10:50:01? ? 2015-04-10 10:10:00
cookie1 2015-04-10 11:00:00? ? url7? ? 7? ? ? 2015-04-10 10:50:05? ? 2015-04-10 10:50:01
cookie2 2015-04-10 10:00:00? ? url11? 1? ? ? 1970-01-01 00:00:00? ? NULL
cookie2 2015-04-10 10:00:02? ? url22? 2? ? ? 2015-04-10 10:00:00? ? NULL
cookie2 2015-04-10 10:03:04? ? 1url33? 3? ? ? 2015-04-10 10:00:02? ? 2015-04-10 10:00:00
cookie2 2015-04-10 10:10:00? ? url44? 4? ? ? 2015-04-10 10:03:04? ? 2015-04-10 10:00:02
cookie2 2015-04-10 10:50:01? ? url55? 5? ? ? 2015-04-10 10:10:00? ? 2015-04-10 10:03:04
cookie2 2015-04-10 10:50:05? ? url66? 6? ? ? 2015-04-10 10:50:01? ? 2015-04-10 10:10:00
cookie2 2015-04-10 11:00:00? ? url77? 7? ? ? 2015-04-10 10:50:05? ? 2015-04-10 10:50:01
last_1_time: 指定了往上第1行的值,default為'1970-01-01 00:00:00'?
? ? ? ? ? ? cookie1第一行,往上1行為NULL,因此取默認值 1970-01-01 00:00:00
? ? ? ? ? ? cookie1第三行钾恢,往上1行值為第二行值,2015-04-10 10:00:02
? ? ? ? ? ? cookie1第六行吕喘,往上1行值為第五行值赘那,2015-04-10 10:50:01
last_2_time: 指定了往上第2行的值,為指定默認值
cookie1第一行氯质,往上2行為NULL
cookie1第二行募舟,往上2行為NULL
cookie1第四行,往上2行為第二行值闻察,2015-04-10 10:00:02
cookie1第七行拱礁,往上2行為第五行值,2015-04-10 10:50:01
LEAD
與LAG相反
LEAD(col,n,DEFAULT) 用于統(tǒng)計窗口內(nèi)往下第n行值
第一個參數(shù)為列名辕漂,第二個參數(shù)為往下第n行(可選呢灶,默認為1),第三個參數(shù)為默認值(當往下第n行為NULL時候钉嘹,取默認值鸯乃,如不指定,則為NULL)
SELECT cookieid,
createtime,
url,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
LEAD(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS next_1_time,
LEAD(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS next_2_time
FROM lxw1234;
cookieid createtime? ? ? ? ? ? url? ? rn? ? ? next_1_time? ? ? ? ? ? next_2_time
-------------------------------------------------------------------------------------------
cookie1 2015-04-10 10:00:00? ? url1? ? 1? ? ? 2015-04-10 10:00:02? ? 2015-04-10 10:03:04
cookie1 2015-04-10 10:00:02? ? url2? ? 2? ? ? 2015-04-10 10:03:04? ? 2015-04-10 10:10:00
cookie1 2015-04-10 10:03:04? ? 1url3? 3? ? ? 2015-04-10 10:10:00? ? 2015-04-10 10:50:01
cookie1 2015-04-10 10:10:00? ? url4? ? 4? ? ? 2015-04-10 10:50:01? ? 2015-04-10 10:50:05
cookie1 2015-04-10 10:50:01? ? url5? ? 5? ? ? 2015-04-10 10:50:05? ? 2015-04-10 11:00:00
cookie1 2015-04-10 10:50:05? ? url6? ? 6? ? ? 2015-04-10 11:00:00? ? NULL
cookie1 2015-04-10 11:00:00? ? url7? ? 7? ? ? 1970-01-01 00:00:00? ? NULL
cookie2 2015-04-10 10:00:00? ? url11? 1? ? ? 2015-04-10 10:00:02? ? 2015-04-10 10:03:04
cookie2 2015-04-10 10:00:02? ? url22? 2? ? ? 2015-04-10 10:03:04? ? 2015-04-10 10:10:00
cookie2 2015-04-10 10:03:04? ? 1url33? 3? ? ? 2015-04-10 10:10:00? ? 2015-04-10 10:50:01
cookie2 2015-04-10 10:10:00? ? url44? 4? ? ? 2015-04-10 10:50:01? ? 2015-04-10 10:50:05
cookie2 2015-04-10 10:50:01? ? url55? 5? ? ? 2015-04-10 10:50:05? ? 2015-04-10 11:00:00
cookie2 2015-04-10 10:50:05? ? url66? 6? ? ? 2015-04-10 11:00:00? ? NULL
cookie2 2015-04-10 11:00:00? ? url77? 7? ? ? 1970-01-01 00:00:00? ? NULL
--邏輯與LAG一樣跋涣,只不過LAG是往上缨睡,LEAD是往下鸟悴。
進入主題,hive實現(xiàn)拉鏈表示例:
-----目標表
create external table existing_time_series_table
(
primary_key string,?---業(yè)務主鍵(字段個數(shù)不限)
effective_dt bigint,?----開始日期
expired_dt bigint,??----失效日期
event_value string----業(yè)務員度量值
? )
? stored as parquet
? location?
? ?'hdfs://nameservice/it/ods/erp/existing_time_series_table';
?----增量結(jié)果集 ??
create external table new_time_series_table
(
primary_key string,---業(yè)務主鍵(字段個數(shù)不限)
effective_dt bigint,?----開始日期
event_value string----業(yè)務員度量值
? )
? stored as parquet
? location?
? ?'hdfs://nameservice/it/ods/erp/new_time_series_table';
-----邏輯實現(xiàn):lead函數(shù)實現(xiàn)了取下個日期作為本記錄的失效日期
insert overwrite table?existing_time_series_table
?select primary_key,
? ? ? ? ?effective_dt,
?case
? ? ? ? ? ?when lead(effective_dt, 1, null)
? ? ? ? ? ? over(partition by primary_key order by effective_dt) is null then
? ? ? ? ? ? null
? ? ? ? ? ?else
? ? ? ? ? ? lead(effective_dt, 1, null)
? ? ? ? ? ? over(partition by primary_key order by effective_dt)
? ? ? ? ?end as expired_dt,
? ? ? ? ?event_value
? ? from (select primary_key, effective_dt, event_value
? ? ? ? ? ? from existing_time_series_table
? ? ? ? ? ?where expired_dt is null
? ? ? ? ? union all
? ? ? ? ? select primary_key, effective_dt, event_value
? ? ? ? ? ? from new_time_series_table) sub_1
? union all ??
-----歷史已經(jīng)失效的記錄
? select primary_key, effective_dt, expired_dt, event_value
? ? from existing_time_series_table
? ?where expired_dt is not null