目錄
- 1 窗口函數(shù) Windowing functions
- 2 OVER詳解 The OVER clause
- 2.1 標(biāo)準(zhǔn)聚合函數(shù)
- 2.2 分析函數(shù) Analytics functions
- 2.3 OVER子句也支持聚合函數(shù)
- 2.4 window clause 的另一種寫法
1 窗口函數(shù) Windowing functions
FIRST_VALUE(col, bool DEFAULT)
返回分組窗口內(nèi)第一行col的值矾柜,DEFAULT默認(rèn)為false阱驾,如果指定為true,則跳過(guò)NULL后再取值
WITH tmp AS
(
SELECT 1 AS group_id, 'a' AS col
UNION ALL SELECT 1 AS group_id, 'b' AS col
UNION ALL SELECT 1 AS group_id, 'c' AS col
UNION ALL SELECT 2 AS group_id, NULL AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
)
SELECT group_id,
col,
FIRST_VALUE(col) over(partition by group_id order by col) as col_new
FROM tmp;
group_id | col | col_new |
---|---|---|
1 | a | a |
1 | b | a |
1 | c | a |
2 | NULL | NULL |
2 | e | NULL |
WITH tmp AS
(
SELECT 1 AS group_id, NULL AS col
UNION ALL SELECT 1 AS group_id, 'b' AS col
UNION ALL SELECT 1 AS group_id, 'c' AS col
UNION ALL SELECT 2 AS group_id, NULL AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
)
SELECT group_id,
col,
FIRST_VALUE(col, true) over(partition by group_id order by col) as col_new
FROM tmp;
group_id | col | col_new |
---|---|---|
1 | NULL | NULL |
1 | b | b |
1 | c | b |
2 | NULL | NULL |
2 | e | e |
LAST_VALUE(col, bool DEFAULT)
返回分組窗口內(nèi)最后一行col的值怪蔑,DEFAULT默認(rèn)為false里覆,如果指定為true,則跳過(guò)NULL后再取值
WITH tmp AS
(
SELECT 1 AS group_id, 'a' AS col
UNION ALL SELECT 1 AS group_id, NULL AS col
UNION ALL SELECT 1 AS group_id, 'c' AS col
UNION ALL SELECT 2 AS group_id, 'd' AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
)
SELECT group_id,
col,
LAST_VALUE(col) over(partition by group_id order by col desc) as col_new
FROM tmp;
group_id | col | col_new |
---|---|---|
1 | c | c |
1 | a | a |
1 | NULL | NULL |
2 | e | e |
2 | d | d |
WITH tmp AS
(
SELECT 1 AS group_id, 'a' AS col
UNION ALL SELECT 1 AS group_id, NULL AS col
UNION ALL SELECT 1 AS group_id, 'c' AS col
UNION ALL SELECT 2 AS group_id, 'd' AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
)
SELECT group_id,
col,
LAST_VALUE(col, true) over(order by group_id,col desc rows between 1 preceding and 1 following) as col_new
FROM tmp;
group_id | col | col_new |
---|---|---|
1 | c | a |
1 | a | a |
1 | NULL | e |
2 | e | d |
2 | d | d |
LEAD(col, n, DEFAULT)
返回分組窗口內(nèi)往下第n行col的值缆瓣,n默認(rèn)為1喧枷,往下第n沒有時(shí)返回DEFAULT(DEFAULT默認(rèn)為NULL)
WITH tmp AS
(
SELECT 1 AS group_id, 'a' AS col
UNION ALL SELECT 1 AS group_id, 'b' AS col
UNION ALL SELECT 1 AS group_id, 'c' AS col
UNION ALL SELECT 2 AS group_id, 'd' AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
)
SELECT group_id,
col,
LEAD(col) over(partition by group_id order by col) as col_new
FROM tmp;
等同于:
WITH tmp AS
(
SELECT 1 AS group_id, 'a' AS col
UNION ALL SELECT 1 AS group_id, 'b' AS col
UNION ALL SELECT 1 AS group_id, 'c' AS col
UNION ALL SELECT 2 AS group_id, 'd' AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
)
SELECT group_id,
col,
LAST_VALUE(col) over(partition by group_id order by col rows between 1 FOLLOWING and 1 FOLLOWING) as col_new
FROM tmp;
返回結(jié)果都是:
group_id | col | col_new |
---|---|---|
1 | a | b |
1 | b | c |
1 | c | NULL |
2 | d | e |
2 | e | NULL |
WITH tmp AS
(
SELECT 1 AS group_id, 'a' AS col
UNION ALL SELECT 1 AS group_id, 'b' AS col
UNION ALL SELECT 1 AS group_id, 'c' AS col
UNION ALL SELECT 2 AS group_id, 'd' AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
)
SELECT group_id,
col,
LEAD(col, 2, 'z') over(partition by group_id order by col) as col_new
FROM tmp;
返回結(jié)果:
group_id | col | col_new |
---|---|---|
1 | a | c |
1 | b | z |
1 | c | z |
2 | d | z |
2 | e | z |
LAG(col, n, DEFAULT)
返回分組窗口內(nèi)往上第n行col的值,n默認(rèn)為1,往上第n沒有時(shí)返回DEFAULT(DEFAULT默認(rèn)為NULL)
WITH tmp AS
(
SELECT 1 AS group_id, 'a' AS col
UNION ALL SELECT 1 AS group_id, 'b' AS col
UNION ALL SELECT 1 AS group_id, 'c' AS col
UNION ALL SELECT 2 AS group_id, 'd' AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
)
SELECT group_id,
col,
LAG(col) over(partition by group_id order by col) as col_new
FROM tmp;
等同于:
WITH tmp AS
(
SELECT 1 AS group_id, 'a' AS col
UNION ALL SELECT 1 AS group_id, 'b' AS col
UNION ALL SELECT 1 AS group_id, 'c' AS col
UNION ALL SELECT 2 AS group_id, 'd' AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
)
SELECT group_id,
col,
FIRST_VALUE(col) over(partition by group_id order by col rows BETWEEN 1 PRECEDING and 1 PRECEDING) as col_new
FROM tmp;
返回結(jié)果都是:
group_id | col | col_new |
---|---|---|
1 | a | NULL |
1 | b | a |
1 | c | b |
2 | d | NULL |
2 | e | d |
WITH tmp AS
(
SELECT 1 AS group_id, 'a' AS col
UNION ALL SELECT 1 AS group_id, 'b' AS col
UNION ALL SELECT 1 AS group_id, 'c' AS col
UNION ALL SELECT 2 AS group_id, 'd' AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
)
SELECT group_id,
col,
LAG(col, 2, 'zz') over(partition by group_id order by col) as col_new
FROM tmp;
返回結(jié)果:
group_id | col | col_new |
---|---|---|
1 | a | zz |
1 | b | zz |
1 | c | a |
2 | d | zz |
2 | e | zz |
2 OVER詳解 The OVER clause
FUNCTION(expr) OVER([PARTITION BY statement] [ORDER BY statement] [window clause])
- FUNCTION:包括標(biāo)準(zhǔn)聚合函數(shù)(COUNT隧甚、SUM车荔、MIN、MAX戚扳、AVG)和一些分析函數(shù)(RANK忧便、ROW_NUMBER、DENSE_RANK等)
- PARTITION BY:可以由一個(gè)或者多個(gè)列組成
- ORDER BY:可以由一個(gè)或者多個(gè)列組成
- window clause:(ROWS | RANGE) BETWEEN (UNBOUNDED PRECEDING | num PRECEDING | CURRENT ROW) AND (UNBOUNDED PRECEDING | num PRECEDING | CURRENT ROW)
- 當(dāng) window clause 未指定時(shí)帽借,默認(rèn)為 RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW珠增,即分組內(nèi)第一行至當(dāng)前行作為窗口
- 當(dāng) window clause 和 ORDER BY 都未指定時(shí),默認(rèn)為 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING砍艾,即分組內(nèi)第一行至最后一行作為窗口
2.1 標(biāo)準(zhǔn)聚合函數(shù)
COUNT(expr) OVER()
返回窗口內(nèi)行數(shù)
WITH tmp AS
(
SELECT 1 AS group_id, 'a' AS col
UNION ALL SELECT 1 AS group_id, 'b' AS col
UNION ALL SELECT 1 AS group_id, 'c' AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
UNION ALL SELECT 2 AS group_id, 'e' AS col
)
SELECT group_id,
col,
count(col) over(partition by group_id) as cnt1,
count(col) over(partition by group_id order by col) as cnt2,
count(col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as cnt3,
count(distinct col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as cnt4
FROM tmp;
group_id | col | cnt1 | cnt2 | cnt3 | cnt4 |
---|---|---|---|---|---|
1 | a | 3 | 1 | 3 | 3 |
1 | b | 3 | 2 | 2 | 2 |
1 | c | 3 | 3 | 1 | 1 |
2 | e | 2 | 2 | 2 | 1 |
2 | e | 2 | 2 | 1 | 1 |
SUM(expr) OVER()
返回窗口內(nèi)求和值
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 2 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
)
SELECT group_id,
col,
SUM(col) over(partition by group_id) as sum1,
SUM(col) over(partition by group_id order by col) as sum2,
SUM(col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as sum3,
SUM(distinct col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as sum4
FROM tmp;
group_id | col | sum1 | sum2 | sum3 | sum4 |
---|---|---|---|---|---|
1 | 1 | 6 | 1 | 6 | 6 |
1 | 2 | 6 | 3 | 5 | 5 |
1 | 3 | 6 | 6 | 3 | 3 |
2 | 4 | 8 | 8 | 8 | 4 |
2 | 4 | 8 | 8 | 4 | 4 |
MIN(expr) OVER()
返回窗口內(nèi)最小值
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 2 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 5 AS col
)
SELECT group_id,
col,
MIN(col) over(partition by group_id) as min1,
MIN(col) over(partition by group_id order by col) as min2,
MIN(col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as min3
FROM tmp;
group_id | col | min1 | min2 | min3 |
---|---|---|---|---|
1 | 1 | 1 | 1 | 1 |
1 | 2 | 1 | 1 | 2 |
1 | 3 | 1 | 1 | 3 |
2 | 4 | 4 | 4 | 4 |
2 | 5 | 4 | 4 | 5 |
MAX(expr) OVER()
返回窗口內(nèi)最大值
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 2 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 5 AS col
)
SELECT group_id,
col,
MAX(col) over(partition by group_id) as max1,
MAX(col) over(partition by group_id order by col) as max2,
MAX(col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as max3
FROM tmp;
group_id | col | max1 | max2 | max3 |
---|---|---|---|---|
1 | 1 | 3 | 1 | 3 |
1 | 2 | 3 | 2 | 3 |
1 | 3 | 3 | 3 | 3 |
2 | 4 | 5 | 4 | 5 |
2 | 5 | 5 | 5 | 5 |
AVG(expr) OVER()
返回窗口內(nèi)平均值
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 2 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
)
SELECT group_id,
col,
AVG(col) over(partition by group_id) as avg1,
AVG(col) over(partition by group_id order by col) as avg2,
AVG(col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as avg3,
AVG(distinct col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as avg4
FROM tmp;
group_id | col | avg1 | avg2 | avg3 | avg4 |
---|---|---|---|---|---|
1 | 1 | 2.0 | 1.0 | 2.0 | 2.0 |
1 | 2 | 2.0 | 1.5 | 2.5 | 2.5 |
1 | 3 | 2.0 | 2.0 | 3.0 | 3.0 |
2 | 4 | 4.0 | 4.0 | 4.0 | 4.0 |
2 | 4 | 4.0 | 4.0 | 4.0 | 4.0 |
2.2 分析函數(shù) Analytics functions
RANK() OVER()
返回分組內(nèi)排名(不支持自定義窗口)
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 5 AS col
)
SELECT group_id,
col,
RANK() over(partition by group_id order by col desc) as r
FROM tmp;
group_id | col | r |
---|---|---|
1 | 3 | 1 |
1 | 3 | 1 |
1 | 1 | 3 |
2 | 5 | 1 |
2 | 4 | 2 |
ROW_NUMBER() OVER()
返回分組內(nèi)行號(hào)(不支持自定義窗口)
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 5 AS col
)
SELECT group_id,
col,
ROW_NUMBER() over(partition by group_id order by col desc) as r
FROM tmp;
group_id | col | r |
---|---|---|
1 | 3 | 1 |
1 | 3 | 2 |
1 | 1 | 3 |
2 | 5 | 1 |
2 | 4 | 2 |
DENSE_RANK() OVER()
返回分組內(nèi)排名(排名相等不會(huì)留下空位蒂教,不支持自定義窗口)
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 5 AS col
)
SELECT group_id,
col,
DENSE_RANK() over(partition by group_id order by col desc) as r
FROM tmp;
group_id | col | r |
---|---|---|
1 | 3 | 1 |
1 | 3 | 1 |
1 | 1 | 2 |
2 | 5 | 1 |
2 | 4 | 2 |
CUME_DIST() OVER()
返回分組內(nèi)累計(jì)分布值,即分組內(nèi)小于(或者大于)等于當(dāng)前值行數(shù)/分組內(nèi)總行數(shù)
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 5 AS col
)
SELECT group_id,
col,
CUME_DIST() over(partition by group_id order by col asc) as d1,
CUME_DIST() over(partition by group_id order by col desc) as d2
FROM tmp;
group_id | col | d1 | d2 |
---|---|---|---|
1 | 3 | 1.0 | 0.6666666666666666 |
1 | 3 | 1.0 | 0.6666666666666666 |
1 | 1 | 0.3333333333333333 | 1.0 |
2 | 5 | 1.0 | 0.5 |
2 | 4 | 0.5 | 1.0 |
PERCENT_RANK() OVER()
返回百分比排序值脆荷,即分組內(nèi)當(dāng)前行的RANK值-1/分組內(nèi)總行數(shù)-1
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 5 AS col
)
SELECT group_id,
col,
RANK() over(partition by group_id order by col asc) as r1,
PERCENT_RANK() over(partition by group_id order by col asc) as p1,
RANK() over(partition by group_id order by col desc) as r2,
PERCENT_RANK() over(partition by group_id order by col desc) as p2
FROM tmp;
group_id | col | r1 | p1 | r2 | p2 |
---|---|---|---|---|---|
1 | 3 | 2 | 0.5 | 1 | 0.0 |
1 | 3 | 2 | 0.5 | 1 | 0.0 |
1 | 1 | 1 | 0.0 | 3 | 1.0 |
2 | 5 | 2 | 1.0 | 1 | 0.0 |
2 | 4 | 1 | 0.0 | 2 | 1.0 |
NTILE(INTEGER x) OVER()
返回分區(qū)編號(hào)(將有序分區(qū)劃分為x個(gè)組凝垛,稱為bucket,并為分區(qū)中的每一行分配一個(gè)bucket編號(hào))
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 5 AS col
)
SELECT group_id,
col,
NTILE(2) over(partition by group_id order by col asc) as bucket_id
FROM tmp;
group_id | col | bucket_id |
---|---|---|
1 | 1 | 1 |
1 | 3 | 1 |
1 | 3 | 2 |
1 | 3 | 2 |
2 | 4 | 1 |
2 | 5 | 2 |
2.3 OVER子句也支持聚合函數(shù)
Hive 2.1.0及之后版本简烘,OVER子句也支持聚合函數(shù)苔严,如:
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 5 AS col
)
SELECT group_id,
RANK() over(order by sum(col) desc) as r
FROM tmp
group by group_id;
結(jié)果為:
group_id | r |
---|---|
2 | 1 |
1 | 2 |
2.4 window clause 的另一種寫法
將window子句寫在from后面,在over后使用別名進(jìn)行引用孤澎,如下:
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 2 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
)
SELECT group_id,
col,
AVG(col) over w1 as avg1,
AVG(distinct col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as avg2
FROM tmp
WINDOW w1 AS (partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following);
結(jié)果為:
group_id | col | avg1 | avg2 |
---|---|---|---|
1 | 1 | 2.0 | 2.0 |
1 | 2 | 2.5 | 2.5 |
1 | 3 | 3.0 | 3.0 |
2 | 4 | 4.0 | 4.0 |
2 | 4 | 4.0 | 4.0 |
WITH tmp AS
(
SELECT 1 AS group_id, 1 AS col
UNION ALL SELECT 1 AS group_id, 2 AS col
UNION ALL SELECT 1 AS group_id, 3 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
UNION ALL SELECT 2 AS group_id, 4 AS col
)
SELECT group_id,
col,
AVG(col) over w1 as avg1,
AVG(distinct col) over w2 as avg2
FROM tmp
WINDOW w1 AS (partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following),
w2 AS (partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following);
結(jié)果為:
group_id | col | avg1 | avg2 |
---|---|---|---|
1 | 1 | 2.0 | 2.0 |
1 | 2 | 2.5 | 2.5 |
1 | 3 | 3.0 | 3.0 |
2 | 4 | 4.0 | 4.0 |
2 | 4 | 4.0 | 4.0 |