背景
需要對用戶的訪問日志記錄進行轉(zhuǎn)化分析葫笼,由于日志記錄沒有業(yè)務session,需要物理地對訪問記錄進行session切分奕谭,規(guī)則是30分鐘內(nèi)的用戶操作為一個會話(session)撵幽,如果兩個操作之間的時間間隔超過30分鐘斯稳,則判定為兩個session。
環(huán)境:GreenPlum(postgresql)
數(shù)據(jù)
/*
用戶ID:bid
訪問時間:event_time
頁面key:event_key
*/
WITH log_data AS(
SELECT
bid,
event_time::TIMESTAMP AS event_time,
event_key,
/* 訪問序號 */
ROW_NUMBER() OVER(PARTITION BY bid ORDER BY event_time) AS num
FROM
(
SELECT '123' AS bid, '2017-11-20 10:01:56' AS event_time, 'page1' AS event_key
UNION ALL
SELECT '123' AS bid, '2017-11-20 10:05:56' AS event_time, 'page2' AS event_key
UNION ALL
SELECT '123' AS bid, '2017-11-20 10:43:56' AS event_time, 'page1' AS event_key
UNION ALL
SELECT '123' AS bid, '2017-11-20 10:46:56' AS event_time, 'page2' AS event_key
UNION ALL
SELECT '123' AS bid, '2017-11-20 10:49:56' AS event_time, 'page3' AS event_key
)tmp
)
SELECT * FROM log_data;
圖一
切分session過程
需要找到一個記錄A的時間減去上一個記錄B的時間差值大于30分鐘庄萎,如下圖中踪少,需要找到箭頭指向的那條10:43:56訪問的記錄
圖二
問題轉(zhuǎn)化為尋找每個session的起始訪問記錄,詳細sql如下:
SELECT
t1.*,
t2.num,
/* 如果記錄為首次或兩個記錄時間差大于30分鐘則打上標記 */
CASE WHEN t2. event_time is NULL OR t1.event_time - t2.event_time > INTERVAL '30 minute' THEN t1.num ELSE null END AS time_diff
FROM
log_data t1
LEFT JOIN
/* 前后記錄關(guān)聯(lián) */
log_data t2 on t1.bid = t2.bid AND t1.num = t2.num + 1
根據(jù)t1.bid = t2.bid AND t1.num = t2.num + 1關(guān)聯(lián)得到切分點的num糠涛,如下圖中的time_diff
圖三
通過time_diff is not null 得到具體的切分點
SELECT
bid,
time_diff,
event_time
FROM
(
SELECT
t1.*,
t2.num,
/* 如果記錄為首次或兩個記錄時間差大于30分鐘則打上標記 */
CASE WHEN t2. event_time is NULL OR t1.event_time - t2.event_time > INTERVAL '30 minute' THEN t1.num ELSE null END AS time_diff
FROM
log_data t1
LEFT JOIN
/* 前后記錄關(guān)聯(lián) */
log_data t2 on t1.bid = t2.bid AND t1.num = t2.num + 1
)t02
WHERE
t02.time_diff IS NOT NULL
圖四
圖一跟圖四通過t01.bid = t02.bid AND t01.num >= t02.time_diff關(guān)聯(lián)排序過濾最終完成session切分
SELECT
*
FROM
(
SELECT
t01.*,
t02.event_time,
time_diff,
/* 取分等值連接后的第一條數(shù)據(jù) */
ROW_NUMBER() OVER(PARTITION BY t01.bid,t01.num ORDER BY t02.time_diff desc) AS num_2
FROM
log_data t01
LEFT JOIN
(
SELECT
bid,
time_diff,
event_time
FROM
(
SELECT
t1.*,
t2.num,
/* 如果記錄為首次或兩個記錄時間差大于30分鐘則打上標記 */
CASE WHEN t2. event_time is NULL OR t1.event_time - t2.event_time > INTERVAL '30 minute' THEN t1.num ELSE null END AS time_diff
FROM
log_data t1
LEFT JOIN
/* 前后記錄關(guān)聯(lián) */
log_data t2 on t1.bid = t2.bid AND t1.num = t2.num + 1
)t02
WHERE
t02.time_diff IS NOT NULL
)t02 ON t01.bid = t02.bid AND t01.num >= t02.time_diff
)tmp
/* 取num_2為1的記錄 */
WHERE num_2 = 1
圖五
圖六
再添加session_id字段即可援奢,本文采用md5(bid+event_time1)作為session_id