hive數(shù)據(jù)庫(kù)是hdfs上的文件夾,表也是文件夾乾蛤,表里的數(shù)據(jù)是文件
hive建表
create table t_student(id string,name string,age int,classNo string)
row format delimited
fields terminated by ',';
創(chuàng)建外部表
create external table t_a(id string,name string)
row format delimited
fields terminated by ','
location '/.../...'
外部表和內(nèi)部表的區(qū)別,drop時(shí)內(nèi)部表的hdfs數(shù)據(jù)一同刪除,外部表的hdfs上的數(shù)據(jù)不刪除
create table t_b(id string,name string)
row format delimited
fields terminated by ','
create table t_a(id string,name string)
row format delimited
fields terminated by ','
--加載數(shù)據(jù)從hdfs上加載
load data inpath '/dataB/b.txt' into table t_a;
--加載數(shù)據(jù)從本地上加載
load data local inpath '/root/bb.txt' into table t_b;
t_a
+---------+-----------+--+
| t_a.id | t_a.name |
+---------+-----------+--+
| a | 1 |
| b | 2 |
| c | 4 |
| d | 5 |
+---------+-----------+--+
t_b
+---------+-----------+--+
| t_b.id | t_b.name |
+---------+-----------+--+
| a | xx |
| b | yy |
| d | zz |
| e | pp |
+---------+-----------+--+
--笛卡爾積
select a.,b.
from t_a inner join t_b
+---------+-----------+---------+-----------+--+
| t_a.id | t_a.name | t_b.id | t_b.name |
+---------+-----------+---------+-----------+--+
| a | 1 | a | xx |
| b | 2 | a | xx |
| c | 4 | a | xx |
| d | 5 | a | xx |
| a | 1 | b | yy |
| b | 2 | b | yy |
| c | 4 | b | yy |
| d | 5 | b | yy |
| a | 1 | d | zz |
| b | 2 | d | zz |
| c | 4 | d | zz |
| d | 5 | d | zz |
| a | 1 | e | pp |
| b | 2 | e | pp |
| c | 4 | e | pp |
| d | 5 | e | pp |
+---------+-----------+---------+-----------+--+
--內(nèi)連接
select * from t_a join t_b where t_a.id = t_b.id;
select * from t_a join t_b on t_a.id = t_b.id;
--左外連接
select
a.,b.
from t_a a left join t_b b
select
a.,b.
from t_a a left join t_b b on a.id = b.id
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| a | 1 | a | xx |
| b | 2 | b | yy |
| d | 5 | d | zz |
| NULL | NULL | e | pp |
+-------+---------+-------+---------+--+
--右外連接
select
a.,b.
from t_a a right outer join t_b b
select
a.,b.
from t_a a right outer join t_b b on a.id = b.id
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| a | 1 | a | xx |
| b | 2 | b | yy |
| d | 5 | d | zz |
| NULL | NULL | e | pp |
+-------+---------+-------+---------+--+
--全外連接
select
a.,b.
from t_a a full outer join t_b b on a.id = b.id
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| a | 1 | a | xx |
| b | 2 | b | yy |
| c | 4 | NULL | NULL |
| d | 5 | d | zz |
| NULL | NULL | e | pp |
+-------+---------+-------+---------+--+
--左半連接 相當(dāng)于指定條件的內(nèi)連接,但是只顯示左邊的表數(shù)據(jù)
select
a.*
from t_a a left semi join t_b b on a.id = b.id
+-------+---------+--+
| a.id | a.name |
+-------+---------+--+
| a | 1 |
| b | 2 |
| d | 5 |
+-------+---------+--+
--例如有這些數(shù)據(jù)
vi access.log.0804
192.168.33.3,http://www.sina.com/stu,2017-08-04 15:30:20
192.168.33.3,http://www.sina.com/teach,2017-08-04 15:35:20
192.168.33.4,http://www.sina.com/stu,2017-08-04 15:30:20
192.168.33.4,http://www.sina.com/job,2017-08-04 16:30:20
192.168.33.5,http://www.sina.com/job,2017-08-04 15:40:20
vi access.log.0805
192.168.33.3,http://www.sina.com/stu,2017-08-05 15:30:20
192.168.44.3,http://www.sina.com/teach,2017-08-05 15:35:20
192.168.33.44,http://www.sina.com/stu,2017-08-05 15:30:20
192.168.33.46,http://www.sina.com/job,2017-08-05 16:30:20
192.168.33.55,http://www.sina.com/job,2017-08-05 15:40:20
vi access.log.0806
192.168.133.3,http://www.sina.com/register,2017-08-06 15:30:20
192.168.111.3,http://www.sina.com/register,2017-08-06 15:35:20
192.168.34.44,http://www.sina.com/pay,2017-08-06 15:30:20
192.168.33.46,http://www.sina.com/excersize,2017-08-06 16:30:20
192.168.33.55,http://www.sina.com/job,2017-08-06 15:40:20
192.168.33.46,http://www.sina.com/excersize,2017-08-06 16:30:20
192.168.33.25,http://www.sina.com/job,2017-08-06 15:40:20
192.168.33.36,http://www.sina.com/excersize,2017-08-06 16:30:20
192.168.33.55,http://www.sina.com/job,2017-08-06 15:40:20
--建分區(qū)表
create table t_access(ip string,url string,access_time string)
partitioned by (day string)
row format delimited
fields terminated by ','
--導(dǎo)入分區(qū)表數(shù)據(jù)
load data local inpath '/root/access.log.0806' into table t_access partition(day='2017-08-06');
--查看當(dāng)前分區(qū)情況
show partitions t_access;
--求每條url訪問(wèn)的次數(shù)
select
url,count(1)
from t_access
group by url;
--求每條url訪問(wèn)的ip中配乓,最大的ip是哪個(gè)
select
url,max(ip)
from t_access
group by url;
--求每個(gè)ip訪問(wèn)同一個(gè)頁(yè)面的記錄中,最晚的一條
select
ip,url,max(access_time)
from t_access
group by ip,url
--求PV
select
url,count(1)
from t_access
group by url
+--------------------------------+------+--+
| url | _c1 |
+--------------------------------+------+--+
| http://www.sina.com/excersize | 3 |
| http://www.sina.com/job | 7 |
| http://www.sina.com/pay | 1 |
| http://www.sina.com/register | 2 |
| http://www.sina.com/stu | 4 |
| http://www.sina.com/teach | 2 |
+--------------------------------+------+--+
--求UV
select
url,count(distinct(ip))
from t_access
group by url
+--------------------------------+-----+--+
| url | c1 |
+--------------------------------+-----+--+
| http://www.sina.com/excersize | 2 |
| http://www.sina.com/job | 5 |
| http://www.sina.com/pay | 1 |
| http://www.sina.com/register | 2 |
| http://www.sina.com/stu | 3 |
| http://www.sina.com/teach | 2 |
+--------------------------------+-----+--+
--求每條url訪問(wèn)的ip中,最大的ip是哪個(gè)
--PV
--UV
--用mapreduce實(shí)現(xiàn)
--求8月6號(hào)pv扰付,uv
select
count(1),url
from t_access
where day = '2017-08-06'
group by url
--求8月4號(hào)以后,每天訪問(wèn)http://www.sina.com/job 的次數(shù)仁讨,以及訪問(wèn)者中ip最大的
select
count(1),max(ip),day
from t_access
where day > '2017-08-04'
and url = 'http://www.sina.com/job'
group by day
select
count(1),max(ip),day
from t_access
where url = 'http://www.sina.com/job'
group by day having day > '2017-08-04'
--求8月4號(hào)以后羽莺,每天每個(gè)頁(yè)面訪問(wèn)總次數(shù),以及頁(yè)面最大的ip洞豁,并且上述查詢結(jié)果中訪問(wèn)次數(shù)大于2的
select tmp.* from
(select
url,day,count(1) cnts,max(ip) ip
from t_access
group by day,url having day > '2017-08-04') tmp
where tmp.cnts > 2
--每天盐固,pv排序
select
url,day,count(1) cnts
from t_access
group by day,url
order by cnts desc
--hive里的order by是全局排序
--sort by是在map里進(jìn)行排序
select
tmp.*
from
(select
url,day,count(1) cnts
from t_access
group by day,url) tmp
distribute by tmp.day sort by tmp.cnts desc
--有如下數(shù)據(jù)
+-------------+---------------+--+
| t_sale.mid | t_sale.money |
+-------------+---------------+--+
| AA | 15.0 |
| AA | 20.0 |
| BB | 22.0 |
| CC | 44.0 |
+-------------+---------------+--+
--distribute by先把數(shù)據(jù)分發(fā)到各個(gè)reduce中,然后sort by在各個(gè)reduce中進(jìn)行局部排序
select
mid,money
from t_sale
distribute by mid sort by money desc
+------+--------+--+
| mid | money |
+------+--------+--+
| AA | 15.0 |
| AA | 20.0 |
| BB | 22.0 |
| CC | 44.0 |
+------+--------+--+
--cluster by mid 等于 distribute by mid sort by mid
--cluster by后面不能跟desc丈挟,asc刁卜,默認(rèn)的只能升序
--order by 是全排序,所有的數(shù)據(jù)會(huì)發(fā)送給一個(gè)reduceTask進(jìn)行處理曙咽,在數(shù)據(jù)量大的時(shí)候蛔趴,reduce就會(huì)超負(fù)荷
select
mid,money
from t_sale
order by money desc
--設(shè)置最大的reduce啟動(dòng)個(gè)數(shù)
set hive.exec.reducers.max=10;
--設(shè)置reduce的啟動(dòng)個(gè)數(shù)
set mapreduce.job.reduce=3
--hive數(shù)據(jù)類型
--數(shù)字類型
tinyint 1byte -128到127
smallint 2byte -32768到32767 char
int
bigint
float
double
--日期類型
timestamp
date
--字符串類型
string
varchar
char
--混雜類型
boolean
binary 二進(jìn)制
--復(fù)合類型
--數(shù)組
流浪地球,吳京:吳孟達(dá):張飛:趙云,2019-09-11
我和我的祖國(guó),葛優(yōu):黃渤:宋佳:吳京,2019-10-01
create table t_movie(movie_name string,actors array<string>,show_time date)
row format delimited fields terminated by ','
collection items terminated by ':';
array_contains
select
from t_movie
where array_contains(actors,'張飛')
size
select
movie_name,actors,show_time,size(actors) num
from t_movie
========================================
map類型
1,zhangsan,father:xiaoming#mother:chentianxing#brother:shaoshuai,28
2,xiaoming,father:xiaoyun#mother:xiaohong#sister:xiaoyue#wife:chentianxing,30
create table t_user(id int,name string,family map<string,string>,jage int)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';
map_keys
select
id,name,map_keys(family) as relation,jage
from t_user;
map_values
select
id,name,map_values(family) as relation,jage
from t_user;
--家庭成員數(shù)量
select
id,name,size(family),jage
from t_user
--有兄弟的人
select
from t_user
where array_contains(map_keys(family),'brother')
========================================
1,zhangsan,18:man:beijing
2,lisi,22:woman:shanghai
create table t_people(id int,name string,info struct<age:int,sex:string,addr:string>)
row format delimited fields terminated by ','
collection items terminated by ':';
select
id,name,info.age
from t_people;
========================================
192,168,33,66,zhangsan,male
192,168,33,77,lisi,male
192,168,43,101,wangwu,female
create table t_people_ip(ip_seg1 string,ip_seg2 string,ip_seg3 string,ip_seg4 string,name string,sex string)
row format delimited fields terminated by ',';
--字符串拼接函數(shù)
concat
select concat('abc','def')
concat_ws
select concat_ws('.',ip_seg1,ip_seg2,ip_seg3,ip_seg4),name,sex from t_people_ip
===================================================================
--求字符串長(zhǎng)度
length
select lenth('jsdfijsdkfjkdsfjkdf');
========================================
select to_date('2019-09-11 16:55:11');
--把字符串轉(zhuǎn)換成unix時(shí)間戳
select unix_timestamp('2019-09-11 11:55:11','yyyy-MM-dd HH:mm:ss');
--把unix時(shí)間戳轉(zhuǎn)換成字符串
select from_unixtime(unix_timestamp(),'yyyy-MM-dd HH:mm:ss');
========================================
--數(shù)學(xué)運(yùn)算函數(shù)
--四舍五入
select round(5.4)
--四舍五入保留2位小數(shù)
select round(5.1345,2)
--向上取整
select ceil(5.3)
--向下取整
select floor(5.3)
--取絕對(duì)值
select abs(-5.2)
--取最大值
select greatest(3,4,5,6,7)
--取最小值
select least(3,4,5,6,7)
==================================================
1,19,a,male
2,19,b,male
3,22,c,female
4,16,d,female
5,30,e,male
6,26,f,female
+---------------+----------------+-----------------+----------------+--+
| t_student.id | t_student.age | t_student.name | t_student.sex |
+---------------+----------------+-----------------+----------------+--+
| 1 | 19 | a | male |
| 2 | 19 | b | male |
| 5 | 30 | e | male |
| 3 | 22 | c | female |
| 4 | 16 | d | female |
| 6 | 26 | f | female |
+---------------+----------------+-----------------+----------------+--+
row_number() over()
select
id,age,name,sex,row_number() over(partition by sex order by age desc) rk
from t_student
select
tmp.*
from
(select
id,age,name,sex,row_number() over(partition by sex order by age desc) rk
from t_student) tmp
where tmp.rk = 1
select
id,age,name,sex,rank() over(partition by sex order by age desc) rk
from t_student
select
id,age,name,sex,dense_rank() over(partition by sex order by age desc) rk
from t_student
========================================
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
C,2015-01,10
C,2015-01,20
A,2015-02,4
A,2015-02,6
C,2015-02,30
C,2015-02,10
B,2015-02,10
B,2015-02,5
A,2015-03,14
A,2015-03,6
B,2015-03,20
B,2015-03,25
C,2015-03,10
C,2015-03,20
--求每個(gè)用戶每個(gè)月的銷售額和到當(dāng)月位置的累計(jì)銷售額
create table t_saller(name string,month string,amount int)
row format delimited fields terminated by ','
create table t_accumulate
as
select name,month,sum(amount) samount
from t_saller
group by name,month
+--------------------+---------------------+-----------------------+--+
| t_accumulate.name | t_accumulate.month | t_accumulate.samount |
+--------------------+---------------------+-----------------------+--+
| A | 2015-01 | 33 |
| A | 2015-02 | 10 |
| A | 2015-03 | 20 |
| B | 2015-01 | 30 |
| B | 2015-02 | 15 |
| B | 2015-03 | 45 |
| C | 2015-01 | 30 |
| C | 2015-02 | 40 |
| C | 2015-03 | 30 |
+--------------------+---------------------+-----------------------+--+
--最前面一行
select
name,month,samount,sum(samount) over(partition by name order by month rows between unbounded preceding and current row) accumlateAmount
from
t_accumulate
| A | 2015-01 | 33 |
| A | 2015-02 | 10 |
| A | 2015-03 | 20 |
| A | 2015-04 | 30 |
| A | 2015-05 | 15 |
| A | 2015-06 | 45 |
| A | 2015-07 | 30 |
| A | 2015-08 | 40 |
| A | 2015-09 | 30 |
select
name,month,samount,sum(samount) over(partition by name order by month rows between 2 preceding and 1 following ) accumlateAmount
from
t_accumulate
preceding |
當(dāng)前行 | 窗口長(zhǎng)度
following |
min() over() ,max() over() , avg() over()
========================================
1,zhangsan,化學(xué):物理:數(shù)學(xué):語(yǔ)文
2,lisi,化學(xué):數(shù)學(xué):生物:生理:衛(wèi)生
3,wangwu,化學(xué):語(yǔ)文:英語(yǔ):體育:生物
create table t_stu_subject(id int,name string,subjects array<string>)
row format delimited fields terminated by ','
collection items terminated by ':'
explode()
select explode(subjects) from t_stu_subject;
select distinct tmp.subs from (select explode(subjects) subs from t_stu_subject) tmp
========================================
lateral view連接函數(shù)
select
id,name,sub
from
t_stu_subject lateral view explode(subjects) tmp as sub
+-----+-----------+------+--+
| id | name | sub |
+-----+-----------+------+--+
| 1 | zhangsan | 化學(xué) |
| 1 | zhangsan | 物理 |
| 1 | zhangsan | 數(shù)學(xué) |
| 1 | zhangsan | 語(yǔ)文 |
| 2 | lisi | 化學(xué) |
| 2 | lisi | 數(shù)學(xué) |
| 2 | lisi | 生物 |
| 2 | lisi | 生理 |
| 2 | lisi | 衛(wèi)生 |
| 3 | wangwu | 化學(xué) |
| 3 | wangwu | 語(yǔ)文 |
| 3 | wangwu | 英語(yǔ) |
| 3 | wangwu | 體育 |
| 3 | wangwu | 生物 |
+-----+-----------+------+--+
========================================
wordcount
words
create table words(line string)
line
hello world hi tom and jack
hello chentianxing qiaoyuan and shaoshuai
hello hello hi tom and shaoshuai
chentianxing love saoshuai
hello love what is love how love
split()
select
tmp.word,count(1) cnts
from
(select
explode(split(line,' ')) word
from words) tmp
group by tmp.word order by cnts desc
========================================
--炸map
select
id,name,key,value
from
t_user
lateral view explode(family) tmp as key,value
========================================
--有web系統(tǒng),每天產(chǎn)生下列的日志文件
2017-09-15號(hào)的數(shù)據(jù):
192.168.33.6,hunter,2017-09-15 10:30:20,/a
192.168.33.7,hunter,2017-09-15 10:30:26,/b
192.168.33.6,jack,2017-09-15 10:30:27,/a
192.168.33.8,tom,2017-09-15 10:30:28,/b
192.168.33.9,rose,2017-09-15 10:30:30,/b
192.168.33.10,julia,2017-09-15 10:30:40,/c
2017-09-16號(hào)的數(shù)據(jù):
192.168.33.16,hunter,2017-09-16 10:30:20,/a
192.168.33.18,jerry,2017-09-16 10:30:30,/b
192.168.33.26,jack,2017-09-16 10:30:40,/a
192.168.33.18,polo,2017-09-16 10:30:50,/b
192.168.33.39,nissan,2017-09-16 10:30:53,/b
192.168.33.39,nissan,2017-09-16 10:30:55,/a
192.168.33.39,nissan,2017-09-16 10:30:58,/c
192.168.33.20,ford,2017-09-16 10:30:54,/c
2017-09-17號(hào)的數(shù)據(jù):
192.168.33.46,hunter,2017-09-17 10:30:21,/a
192.168.43.18,jerry,2017-09-17 10:30:22,/b
192.168.43.26,tom,2017-09-17 10:30:23,/a
192.168.53.18,bmw,2017-09-17 10:30:24,/b
192.168.63.39,benz,2017-09-17 10:30:25,/b
192.168.33.25,haval,2017-09-17 10:30:30,/c
192.168.33.10,julia,2017-09-17 10:30:40,/c
--統(tǒng)計(jì)日活躍用戶
--統(tǒng)計(jì)每日新增用戶
--統(tǒng)計(jì)歷史用戶
--創(chuàng)建日志表
create table t_web_log(ip string,uname string,access_time string,url string)
partitioned by (day string)
row format delimited fields terminated by ',';
--創(chuàng)建日活躍用戶表
create table t_user_active_day
like
t_web_log;
--統(tǒng)計(jì)日活躍用戶
insert into table t_user_active_day partition(day='2017-09-15')
select tmp.ip,tmp.uname,tmp.access_time,tmp.url
from
(select
ip,uname,access_time,url,row_number() over(partition by uname order by access_time) rn
from
t_web_log where day='2017-09-15') tmp
where rn <2
--創(chuàng)建歷史用戶表
create table t_user_history(uname string)
--創(chuàng)建日新增用戶表
create table t_user_new_day like t_user_active_day;
--統(tǒng)計(jì)日新增用戶
insert into table t_user_new_day partition(day='2017-09-15')
select
tua.ip,tua.uname,tua.access_time,tua.url
from t_user_active_day tua
left join t_user_history tuh on tua.uname = tuh.uname
where tua.day = '2017-09-15' and tuh.uname IS NULL
--記錄歷史用戶
insert into table t_user_history
select
uname
from t_user_new_day
where day='2017-09-15'
insert into table t_user_active_day partition(day='2017-09-16')
select tmp.ip,tmp.uname,tmp.access_time,tmp.url
from
(select
ip,uname,access_time,url,row_number() over(partition by uname order by access_time) rn
from
t_web_log where day='2017-09-16') tmp
where rn <2
insert into table t_user_new_day partition(day='2017-09-16')
select
tua.ip,tua.uname,tua.access_time,tua.url
from t_user_active_day tua
left join t_user_history tuh on tua.uname = tuh.uname
where tua.day = '2017-09-16' and tuh.uname IS NULL
insert into table t_user_history
select
uname
from t_user_new_day
where day='2017-09-16'
--桶表
--創(chuàng)建桶表例朱,按id分成三個(gè)桶
create table t1(id int,name string,age int)
clustered by(id) into 3 buckets
row format delimited fields terminated by ',';
--桶表插入數(shù)據(jù)的方式
create table t1_1(id int,name string,age int)
row format delimited fields terminated by ',';
set hive.enforce.bucketing=true;
set mapreduce.job.reduces=3;
insert into t1 select * from t1_1;
========================================
hive自定義函數(shù)
package com.bwie.hive.udf;
import com.alibaba.fastjson.JSONObject;
import org.apache.hadoop.hive.ql.exec.UDF;
public class JsonUDF extends UDF {
public String evaluate(String json,String colum){
JSONObject jsonObject = JSONObject.parseObject(json);
String value = jsonObject.getString(colum);
return value;
}
}
打包上傳
add jar /root/hivetest-1.0-SNAPSHOT.jar;
create temporary function getjson as 'com.bwie.hive.udf.JsonUDF';
select
getjson(json,'movie') as movie,getjson(json,'rate') as rate,from_unixtime(CAST(getjson(json,'timeStamp') as BIGINT),'yyyy-MM-dd HH:mm:ss') as showtime,getjson(json,'uid') as uid
from t_ex_rat limit 10;
--創(chuàng)建永久函數(shù)
方法一孝情、
在hive-site.xml里添加jar包
<property>
<name>hive.aux.jars.path</name>
<value>file:///usr/local/apache-hive-1.2.2-bin/lib/hivetest-1.0-SNAPSHOT.jar</value>
</property>
疑似jar包要放到lib下才能生效
create function getjson AS 'com.bwie.hive.udf.JsonUDF'
方法二、
create function getjson as 'com.bwie.hive.udf.JsonUDF' using jar 'hdfs://hdp1703/lib/hivetest-1.0-SNAPSHOT.jar'
drop function getjson
========================================
create table t_employee(id int,name string,age int)
partitioned by(state string,city string)
row format delimited fields terminated by ','
create table t_employee_orgin(id int,name string,age int,state string,city string)
row format delimited fields terminated by ',';
load data local inpath '/root/employ.txt' into table t_employee_orgin;
在嚴(yán)格模式下洒嗤,多個(gè)分區(qū)一定有一個(gè)分區(qū)是固定的
insert into t_employee
partition(state='china',city)
select id,name,age,city
from t_employee_orgin
where state='china'
--非嚴(yán)格模式
set hive.exec.dynamic.partition.mode=nostrict;
insert into t_employee
partition(state,city)
select id,name,age,state,city
from t_employee_orgin
========================================
select to_date('2019-09-20 17:30:00')
unix_timestamp('2019/09/20 17:30:00','yyyy/MM/dd HH:mm:ss')
select from_unixtime(unix_timestamp('2019/09/20 17:30:00','yyyy/MM/dd HH:mm:ss'),'yyyy-MM-dd HH:mm:ss')