hive基本語法

DDL數據定義

創(chuàng)建數據庫：

create databases if not exists db_hive;
create datebases if not exists db_hive localtion `db_hive2.db`

修改數據庫:

alter database db_hive set dbproperties(`createtime`=`20170101`)
//用戶可以用alter database命令為數據庫設置dbrpoprities屬性值
//此為屬性信息土童，數據庫的表名和數據庫所在目錄位置不可以修改

顯示數據庫:

show databases;//顯示數據庫
show databases like 'db_hive*';//過濾顯示查詢數據庫
desc database db_hive;//顯示數據庫信息
desc database extended db_hive;//顯示數據庫詳細信息 
use database db_hive;//使用數據庫

刪除數據庫

drop database db_hive2;//刪除空數據庫
drop database if exists db_hive2;//判斷數據庫是否存在
drop database db_hive2 cascade;//數據庫不為空 用cascade強制刪除

創(chuàng)建表

//創(chuàng)建普通表
create table if not exists student2(
id int,name string
)
row format delimited fields terminated by '\t'
stored as textfile
location '/user/hive/warehouse/student2';

//未被external修飾的是內部表（managed table），被external修飾的為外部表（external table）刪除外部表的話只會刪除元數據浦楣，數據不會被刪除效览； 
create external table if not exists default.dept(
deptno int,
dname string,
loc int
)
row format delimited fields terminated by '\t';

分區(qū)表

創(chuàng)建分區(qū)表
hive (default)> create table dept_partition(
               deptno int, dname string, loc string
               )
               partitioned by (month string)
               row format delimited fields terminated by '\t';

加載數據到分區(qū)表
hive (default)> load data local inpath '/opt/module/datas/dept.txt' into table default.dept_partition partition(month='201709');
hive (default)> load data local inpath '/opt/module/datas/dept.txt' into table default.dept_partition partition(month='201708');
hive (default)> load data local inpath '/opt/module/datas/dept.txt' into table default.dept_partition partition(month='201707');

查詢分區(qū)表數據
hive (default)> select * from dept_partition where month='201709'
              union
              select * from dept_partition where month='201708'
              union
              select * from dept_partition where month='201707';

增加分區(qū)
hive (default)>  alter table dept_partition add partition(month='201705') partition(month='201704');

刪除分區(qū)
hive (default)> alter table dept_partition drop partition (month='201705'), partition (month='201706');

查看分區(qū)表有多少分區(qū)
hive>show partitions dept_partition;

查看分區(qū)表結構
hive>desc formatted dept_partition;

# Partition Information          
# col_name              data_type               comment             
month                   string
    
//加載數據到分區(qū)
hive (default)> load data local inpath '/opt/module/datas/dept.txt' into table default.dept_partition2 partition(month='201709', day='13');
hive (default)> select * from dept_partition2 where month='201709' and day='13';

//把數據直接上傳到分區(qū)目錄上孽拷，讓分區(qū)表和數據產生關聯的三種方式
方式一：上傳數據后修復
hive (default)> dfs -mkdir -p /user/hive/warehouse/dept_partition2/month=201709/day=12;
hive (default)> dfs -put /opt/module/datas/dept.txt  /user/hive/warehouse/dept_partition2/month=201709/day=12;
hive (default)> msck repair table dept_partition2;
hive (default)> select * from dept_partition2 where month='201709' and day='12';
方式二：上傳數據后添加分區(qū)
hive (default)> dfs -mkdir -p /user/hive/warehouse/dept_partition2/month=201709/day=11;
hive (default)> dfs -put /opt/module/datas/dept.txt  /user/hive/warehouse/dept_partition2/month=201709/day=11;
hive (default)> alter table dept_partition2 add partition(month='201709', day='11');
hive (default)> select * from dept_partition2 where month='201709' and day='11';
方式三：上傳數據后load數據到分區(qū)
hive (default)> dfs -mkdir -p /user/hive/warehouse/dept_partition2/month=201709/day=10;
hive (default)> load data local inpath '/opt/module/datas/dept.txt' into table dept_partition2 partition(month='201709',day='10');
hive (default)> select * from dept_partition2 where month='201709' and day='10';

修改表
//重命名表名
hive (default)> alter table dept_partition2 rename to dept_partition3;
//添加列
hive (default)> alter table dept_partition add columns(deptdesc string);
//更新列
hive (default)> alter table dept_partition change column deptdesc desc int;
//替換列
hive (default)> alter table dept_partition replace columns(deptno string, dname string, loc string);
//刪除表
hive (default)> drop table dept_partition;
/

DML數據操作

數據導入
//向表中裝載數據（Load）
hive>load data [local] inpath '/opt/module/datas/student.txt' [overwrite] into table student [partition (partcol1=val1,…)];
//（1）load data:表示加載數據
//（2）local:表示從本地加載數據到hive表；否則從HDFS加載數據到hive表
//（3）inpath:表示加載數據的路徑
//（4）overwrite:表示覆蓋表中已有數據亿汞，否則表示追加
//（5）into table:表示加載到哪張表
//（6）student:表示具體的表
//（7）partition:表示上傳到指定分區(qū)

//通過查詢語句向表中插入數據（Insert）
hive (default)> from student
              insert overwrite table student partition(month='201707')
              select id, name where month='201709'
              insert overwrite table student partition(month='201706')
              select id, name where month='201709';

// 查詢語句中創(chuàng)建表并加載數據（As Select）
//根據查詢結果創(chuàng)建表（查詢的結果會添加到新創(chuàng)建的表中）
create table if not exists student3
as select id, name from student;

//創(chuàng)建表時通過Location指定加載數據路徑
hive (default)> create table if not exists student5(
              id int, name string
              )
              row format delimited fields terminated by '\t'
              location '/user/hive/warehouse/student5';
hive (default)> dfs -put /opt/module/datas/student.txt  /user/hive/warehouse/student5;
hive (default)> select * from student5;

//Import數據到指定Hive表中
hive (default)> import table student2 partition(month='201709') from '/user/hive/warehouse/export/student';

數據導出
//insert導入
hive (default)> insert overwrite local directory '/opt/module/datas/export/student'  select * from student;
//將查詢結果格式化導入
hive (default)> insert overwrite local directory '/opt/module/datas/export/student1'
             ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
             select * from student;
//將查詢的結果導出到HDFS上(沒有l(wèi)ocal)
hive (default)> insert overwrite directory '/user/atguigu/student2'
             ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
             select * from student;
//Hadoop命令導出到本地
hive (default)> dfs -get /user/hive/warehouse/student/month=201709/000000_0  /opt/module/datas/export/student3.txt;
//Hive Shell 命令導出
[atguigu@hadoop102 hive]$ bin/hive -e 'select * from default.student;' > /opt/module/datas/export/student4.txt;
//Export導出到HDFS上
hive (default)> export table default.student to '/user/hive/warehouse/export/student';

清除表中數據（Truncate）
注意：Truncate只能刪除管理表羔沙，不能刪除外部表中數據
hive (default)> truncate table student;

查詢

基本查詢（Select…From） 
//全表和特定字段查詢
hive (default)> select * from emp;
hive (default)> select empno, ename from emp;
//列別名
hive (default)> select ename AS name, deptno dn from emp;
//算術運算符
//查詢出所有員工的薪水后加1顯示。
hive (default)> select sal +1 from emp;
//常用函數
hive (default)> select count(*) cnt from emp;總行數
hive (default)> select max(sal) max_sal from emp;最大值
hive (default)> select min(sal) min_sal from emp;最小值
hive (default)> select sum(sal) sum_sal from emp;總和
hive (default)> select avg(sal) avg_sal from emp;平均值
hive (default)> select * from emp limit 5; limit語句

Where語句
//查詢出薪水大于1000的所有員工
hive (default)> select * from emp where sal >1000;
//比較運算符（Between/In/ Is Null）
hive (default)> select * from emp where sal =5000;
hive (default)> select * from emp where sal between 500 and 1000;
hive (default)> select * from emp where comm is null;
hive (default)> select * from emp where sal IN (1500, 5000);
// Like和RLike
//RLIKE子句是Hive中這個功能的一個擴展伟骨，其可以通過Java的正則表達式這個更強大的語言來指定匹配條件饮潦。
//查找以2開頭薪水的員工信息
hive (default)> select * from emp where sal LIKE '2%';
//找第二個數值為2的薪水的員工信息
hive (default)> select * from emp where sal LIKE '_2%';
//查找薪水中含有2的員工信息
hive (default)> select * from emp where sal RLIKE '[2]';
邏輯運算符（And/Or/Not）
hive (default)> select * from emp where sal>1000 and deptno=30;
hive (default)> select * from emp where sal>1000 or deptno=30;
hive (default)> select * from emp where deptno not IN(30, 20);

分組
Group By語句
//計算emp表每個部門的平均工資
hive (default)> select t.deptno, avg(t.sal) avg_sal from emp t group by t.deptno;
//計算emp每個部門中每個崗位的最高薪水
hive (default)> select t.deptno, t.job, max(t.sal) max_sal from emp t group by t.deptno, t.job;

Having語句
having與where不同點
//where針對表中的列發(fā)揮作用，查詢數據携狭；having針對查詢結果中的列發(fā)揮作用继蜡，篩選數據。
//having只用于group by分組統(tǒng)計語句逛腿。

求每個部門的平均工資
hive (default)> select deptno, avg(sal) from emp group by deptno;

//求每個部門的平均薪水大于2000的部門
hive (default)> select deptno, avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;

Join語句
//等值Join,Hive支持通常的SQL JOIN語句稀并，但是只支持等值連接，不支持非等值連接单默。
//根據員工表和部門表中的部門編號相等碘举，查詢員工編號、員工名稱和部門編號搁廓；
hive (default)> select e.empno, e.ename, d.deptno, d.dname from emp e join dept d on e.deptno = d.deptno;
//合并員工表和部門表
hive (default)> select e.empno, e.ename, d.deptno from emp e join dept d on e.deptno = d.deptno;
//內連接：只有進行連接的兩個表中都存在與連接條件相匹配的數據才會被保留下來引颈。
hive (default)> select e.empno, e.ename, d.deptno from emp e join dept d on e.deptno = d.deptno;
//左外連接：JOIN操作符左邊表中符合WHERE子句的所有記錄將會被返回。
hive (default)> select e.empno, e.ename, d.deptno from emp e left join dept d on e.deptno = d.deptno;
//右外連接：JOIN操作符右邊表中符合WHERE子句的所有記錄將會被返回境蜕。
hive (default)> select e.empno, e.ename, d.deptno from emp e right join dept d on e.deptno = d.deptno;
//滿外連接：將會返回所有表中符合WHERE語句條件的所有記錄蝙场。如果任一表的指定字段沒有符合條件的值的話，那么就使用NULL值替代粱年。
hive (default)> select e.empno, e.ename, d.deptno from emp e full join dept d on e.deptno = d.deptno;

//大多數情況下售滤，Hive會對每對JOIN連接對象啟動一個MapReduce任務。本例中會首先啟動一個MapReduce job對表e和表d進行連接操作逼泣，然后會再啟動一個MapReduce job將第一個MapReduce job的輸出和表l;進行連接操作趴泌。
//注意：為什么不是表d和表l先進行連接操作呢舟舒？這是因為Hive總是按照從左到右的順序執(zhí)行的。
hive (default)>SELECT e.ename, d.deptno, l. loc_name
FROM   emp e 
JOIN   dept d
ON     d.deptno = e.deptno 
JOIN   location l
ON     d.loc = l.loc;

//笛卡爾積 JOIN
hive (default)> select empno, deptno from emp, dept;
FAILED: SemanticException Column deptno Found in more than One Tables/Subqueries

排序
//全局排序（Order By）：全局排序嗜憔，一個MapReduce
//查詢員工信息按工資升序排列
hive (default)> select * from emp order by sal;
//查詢員工信息按工資降序排列
hive (default)> select * from emp order by sal desc;
//按照員工薪水的2倍排序
hive (default)> select ename, sal*2 twosal from emp order by twosal;
//按照部門和工資升序排序
hive (default)> select ename, deptno, sal from emp order by deptno, sal ;

//每個MapReduce內部排序（Sort By）:
//Sort By：每個MapReduce內部進行排序秃励，對全局結果集來說不是排序
//根據部門降序查看員工信息
hive (default)> select * from emp sort by empno desc;

//分區(qū)排序（Distribute By）：類似MR中partition，進行分區(qū)吉捶，結合sort by使用夺鲜。
//注意，Hive要求DISTRIBUTE BY語句要寫在SORT BY語句之前呐舔。
//（1）先按照部門編號分區(qū)币励，再按照員工編號降序排序。
hive (default)> insert overwrite local directory '/opt/module/datas/distby-desc' select * from emp distribute by deptno sort by empno desc;

Cluster By
//當distribute by和sorts by字段相同時珊拼，可以使用cluster by方式食呻。
//cluster by除了具有distribute by的功能外還兼具sort by的功能。但是排序只能是倒序排序澎现，不能指定排序規(guī)則為ASC或者DESC仅胞。
select * from emp cluster by deptno;等價于
select * from emp distribute by deptno sort by deptno;

分桶及抽樣查詢
//分區(qū)針對的是數據的存儲路徑；分桶針對的是數據文件剑辫。