一搅幅、思維導圖
二溯职、準備工作
2.1使用工具
由于hive安裝復雜,本項目使用金融數據分析案例第四篇《Hive初步學習》提供的【數據蛙環(huán)境(linux服務地址:106.13.128.83、用戶名:froghd、密碼:暫不透露)】進行操作琐鲁。
操作工具主要為Xshell.6.0.0121.7z【優(yōu)點:更便于使用linux服務】。
2.2數據下載
數據下載鏈接
下載完成后围段,我放在桌面。
2.3數據描述
三份數據如下(均為 .dat 文件):
1挡毅、users.dat
數據格式為: 2::M::56::16::70072蒜撮,共有6040條數據
對應字段為:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
對應字段中文解釋:用戶id,性別跪呈,年齡,職業(yè)取逾,郵政編碼
2耗绿、movies.dat
數據格式為: 2::Jumanji (1995)::Adventure|Children's|Fantasy,共有3883條數據
對應字段為:MovieID BigInt, Title String, Genres String
對應字段中文解釋:電影ID砾隅,電影名字误阻,電影類型
3、ratings.dat
數據格式為: 1::1193::5::978300760晴埂,共有1000209條數據
對應字段為:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
對應字段中文解釋:用戶ID,電影ID,評分竭缝,評分時間戳
2.4數據上傳至linux服務器
cd /home/mike -- 然后輸入左側代碼,回車后顯示如下圖
sudo rz
-- 如果輸入rz報錯琅锻,說明你沒有權限卦停;此時需要通過輸入sudo rz來
-- sudo允許一個已授權用戶以超級用戶角色來訪問
三拇派、十題操作
1、正確建表凿跳,導入數據(三張表件豌,三份數據),并驗證是否正確
創(chuàng)建一個數據庫wt拄显,在wt數據庫中創(chuàng)建3張表苟径,t_user,t_movie躬审,t_rating
t_user : userid bigint,sex string,age int,occupation string,zipcode string
t_movie : movieid bigint,moviename string,movietype string
t_rating : userid bigint,movieid bigint,rate double,times string
原始數據是以::進行切分的棘街,所以需要使用能解析多字節(jié)分隔符的Serde即可
使用RegexSerde
需要兩個參數:
input.regex = "(.)::(.)::(.*)"
output.format.string = "%1s %3$s"
-- 創(chuàng)建數據庫
drop database if exists wt;
create database if not exists wt;
use wt; -- 此步不可少!
-- 創(chuàng)建表t_user
create table t_user(
userid bigint,
sex string,
age int,
occupation string,
zipcode string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s')
stored as textfile;
-- 創(chuàng)建表t_movie
create table t_movie(
movieid bigint,
moviename string,
movietype string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s')
stored as textfile;
-- 創(chuàng)建表t_rating
create table t_rating(
userid bigint,
movieid bigint,
rate double,
times string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s')
stored as textfile;
-- 導入數據
load data local inpath "/home/mike/users.dat" into table t_user;
-- 驗證
select t.* from t_user t;
2承边、求被評分次數最多的10部電影遭殉,并給出評分次數(電影名,評分次數)
select b.moviename as moviename,count(a.rate) as total from t_rating a
join t_movie b
on a.movieid=b.movieid
group by moviename --【此處是注釋】或者b.moviename
order by total desc
limit 10;
3博助、分別求男性险污,女性當中評分最高的10部電影(性別,電影名富岳,影評分)
select a.sex as sex,c.moviename as moviename,avg(b.rate) as avgrate,count(c.moviename) as total from t_user a
join t_rating b on a.userid=b.userid
join t_movie c on c.movieid=b.movieid
group by sex,moviename
having sex="F" -- having sex="M"即為男性
order by avgrate desc
limit 100;
4蛔糯、求movieid = 2116這部電影各年齡段(因為年齡就只有7個,就按這個7個分就好了)的平均影評(年齡段窖式,影評分)
select a.age as age,avg(b.rate) as avgrate
from t_user a
join t_rating b on a.userid=b.userid
where b.movieid=2116
group by age
order by avgrate desc;
5蚁飒、求最喜歡看電影(影評次數最多)的那位女性評最高分的10部電影的平均影評分(觀影者,電影名萝喘,影評分)
select a.userid ,count(a.rate) as ratecount
from t_rating a
join t_user b on a.userid=b.userid
where b.sex="F"
group by a.userid
order by ratecount desc
limit 5;
create table answer_B as
select a.movieid as movieid, a.rate as rate
from t_rating a
where a.userid=1150
order by rate desc;
select * from answer_B limit 10;
select b.moviename as moviename,avg(c.rate) as avgrate
from answer_B a
join t_movie b on a.movieid=b.movieid
join t_rating c on b.movieid=c.movieid
group by moviename;
6淮逻、求好片(評分>=4.0)最多的那個年份的最好看的10部電影
create table answer6_A as
select a.movieid as movieid,a.moviename as moviename,substr(a.moviename,-5,4) as year,avg(b.rate) as avgrate
from t_movie a
join t_rating b on a.movieid=b.movieid
group by a.movieid,a.moviename;
select * from answer6_A limit 10;
select year,count(avgrate>=4) as count
from answer6_A
group by year
order by count desc
limit 5;
select movieid,moviename,avgrate
from answer6_A
where year=1998
order by avgrate desc
limit 10;
7、求1997年上映的電影中阁簸,評分最高的10部Comedy類電影
create table answer7_A as
select a.movieid as id,a.moviename as name,a.year as year,a.avgrate as avgrate,b.movietype as type
from answer6_A a
join t_movie b on a.movieid=b.movieid;
select * from answer7_A limit 10;
select id,name,avgrate,type
from answer7_A
where year=1997 and instr(lcase(type),"comedy")>0
order by avgrate desc
limit 10;
8爬早、該影評庫中各種類型電影中評價最高的5部電影(類型,電影名启妹,平均影評分)
create table answer8_A as
select a.id as id ,a.name as name,a.year as year,a.avgrate as avgrate,tv.typesplit as typesplit
from answer7_A a
lateral view explode(split(type,"\\|")) tv as typesplit;
create table answer8_B as
select id,year,lcase(typesplit) as type,name,avgrate,row_number() over(partition by lcase(typesplit) order by avgrate desc) as num
from answer8_A;
select * from answer8_B
limit 10;
select type,name,avgrate from answer8_B
where num<=5;
9筛严、各年評分最高的電影類型(年份,類型翅溺,影評分)
create table answer9_A as
select year,lcase(typesplit) as type,avg(avgrate) as rate
from answer8_A
group by year,lcase(typesplit) ;
select * from answer9_A limit 10;
create table answer9_B as
select year,type,rate,row_number() over (partition by year order by rate) as num
from answer9_A;
select * from answer9_B where num=1;
10脑漫、每個地區(qū)最高評分的電影名髓抑,把結果存入HDFS(地區(qū),電影名优幸,影評分)
create table answer10_A as
select a.zipcode as city,c.moviename as name, avg(b.rate) as avgrate
from t_user a
join t_rating b on a.userid=b.userid
join t_movie c on b.movieid=c.movieid
group by a.zipcode, c.moviename;
create table answer10_B as
select city,name,avgrate,row_number() over(partition by city order by avgrate desc) as num
from answer10_A;
select * from answer10_B limit 10;
insert overwrite directory "/wt/answer10"
row format delimited fields terminated by "\t"
select * from answer10_B where num=1;
參考資料:
(1)Xshell 怎么上傳文件到Linux
(2)Hive學習之路 (十二)Hive SQL練習之影評案例
(3)金融數據分析案例第四篇《Hive初步學習》