Hive中SQL的優(yōu)化技巧,核心思想是避免數(shù)據(jù)傾斜守呜。
1粱玲、避免在同一個查詢中同時出現(xiàn)count, distinct,group by
2、left join 時把小數(shù)據(jù)量的表放在前面
3昧旨、盡量使用子查詢
參數(shù)配置
SET mapred.reduce.tasks=50;
SET mapreduce.reduce.memory.mb=6000;
SET mapreduce.reduce.shuffle.memory.limit.percent=0.06;
涉及數(shù)據(jù)傾斜的話拾给,主要是reduce中數(shù)據(jù)傾斜的問題祥得,可能通過設置hive中reduce的并行數(shù),reduce的內(nèi)存大小單位為m蒋得,reduce中 shuffle的刷磁盤的比例级及,來解決。
實例一
--分月
select substr(a.day,1,6)month,count(distinct a.userid)
from dms.tracklog_5min a
join default.site_activeuser_tmp c
on a.userid=c.id
where a.day>='201505' and a.day<'201506'
group by substr(a.day,1,6) ;
--優(yōu)化后
select '201505',count(*) from
(
select distinct c.userid
from
(select userid from default.site_activeuser_tmp where month='201505') c
left join
(
select userid from
dms.tracklog_5min
where day>='201505' and day<'201506'
) tmp
on tmp.userid=c.userid
) t;
實例二
--分事業(yè)部
select substr(a.day,1,6)month,count(distinct a.userid) ,b.dept_name
from dms.tracklog_5min a join default.d_channel b
on a.host=b.host
join default.site_activeuser_tmp c
on a.userid=c.id
where a.day>='201505' and a.day<'201506'
group by substr(a.day,1,6),b.dept_name;
--優(yōu)化后
SET mapred.reduce.tasks=50;
SET mapreduce.reduce.memory.mb=6000;
SET mapreduce.reduce.shuffle.memory.limit.percent=0.06;
select "201505" month,count(t.userid),t.dept_name
from
(select userid from default.site_activeuser_tmp where month='201505') c
left join
(
select distinct a.userid userid,b.dept_name dept_name from default.d_channel b
left join
(select host,userid from dms.tracklog_5min where day>='201505' and day<'201506' ) a
on a.host=b.host
)t
on t.userid=c.userid
group by t.dept_name ;
實例三
--分產(chǎn)品
select substr(a.day,1,6)month,count(distinct a.userid) ,b.dept_name,b.prod_name
from dms.tracklog_5min a join default.d_channel b
on a.host=b.host
join default.site_activeuser_tmp c
on a.userid=c.id
where a.day>='201505' and a.day<'201506'
group by substr(a.day,1,6),b.dept_name,b.prod_name;
--優(yōu)化后
select "201505" month,count(t.userid) cnt,t.dept_name dept_name,t.prod_name prod_name
from
(select userid from default.site_activeuser_tmp where month='201505') c
left join
(
select distinct a.userid userid,b.dept_name dept_name,b.prod_name prod_name from default.d_channel b
left join
(select host,userid from dms.tracklog_5min where day>='201505' and day<'201506' ) a
on a.host=b.host
)t
on t.userid=c.userid
group by t.prod_name,t.dept_name ;