0.需求描述
統(tǒng)計(jì) 谷粒視頻 網(wǎng)站的常規(guī)指標(biāo)仔沿,各種 TopN 指標(biāo):
--統(tǒng)計(jì)視頻觀看數(shù) Top10
--統(tǒng)計(jì)視頻類別熱度 Top10
--統(tǒng)計(jì)視頻觀看數(shù) Top20 所屬類別
--統(tǒng)計(jì)視頻觀看數(shù) Top50 所關(guān)聯(lián)視頻的所屬類別 Rank
--統(tǒng)計(jì)每個(gè)類別中的視頻熱度 Top10
--統(tǒng)計(jì)每個(gè)類別中視頻流量 Top10
--統(tǒng)計(jì)上傳視頻最多的用戶 Top10 以及他們上傳的視頻
--統(tǒng)計(jì)每個(gè)類別視頻觀看數(shù) Top10
1.數(shù)據(jù)預(yù)處理
對將要處理的數(shù)據(jù)先進(jìn)行一次數(shù)據(jù)清洗习勤,過濾掉不合格的臟數(shù)據(jù)聪全,同時(shí)調(diào)整數(shù)據(jù)的格式
1.1 將影片類型一欄->People & Blogs 中的空格去掉
1.2 將第9列以后的數(shù)據(jù)(關(guān)聯(lián)的其他影片ID)用“&“符號連接 (因?yàn)樵诮ū頃r(shí)這兩列都需要設(shè)置為array類型辉饱,所以要統(tǒng)一分隔字符)
1.3 建立maven工程
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.victor</groupId>
<artifactId>ETLUtils</artifactId>
<version>0.0.1-SNAPSHOT</version>
<build>
<plugins><plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin></plugins></build>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>RELEASE</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.8.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
<version>1.8</version>
<scope>system</scope>
<systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
</dependency>
</dependencies>
</project>
ETLUtilMapper.java
package com.victor.ETLUtils;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class ETLUtilsMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
// PF_ZMlw4rHs DisneyUnleashed 729 Film&Animation 288 96 3.5 2 0 1 xbSFrHzFQ0 4
// VP4qSjDNQs RJgGeYiJrj0
Text k = new Text();
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
throws IOException, InterruptedException {
String line = value.toString();
// 將第四個(gè)字段中的空格去掉
String newline = util(line);
if (StringUtils.isNotBlank(newline)) {
k.set(newline);
context.write(k, NullWritable.get());
}
}
private String util(String line) {
StringBuffer sb = new StringBuffer();
String[] fields = line.split("\t");
if (fields.length < 9) {
return null;
}
fields[3] = fields[3].replaceAll(" ", "");
for (int i = 0; i < fields.length; i++) {
// 如果字段小于9 眨八,丟棄
if (i < 9) {
sb.append(fields[i] + "\t");
} else {
if (fields.length-1 == i) {
sb.append(fields[i]);
} else {
sb.append(fields[i] + "&");
}
}
}
return sb.toString();
}
}
ETLUtilDriver.java
package com.victor.ETLUtils;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.sun.jersey.core.impl.provider.entity.XMLJAXBElementProvider.Text;
public class ETLUtilsDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//args = new String[] {"2.txt","output"};
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(ETLUtilsDriver.class);
job.setMapperClass(ETLUtilsMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
boolean result = job.waitForCompletion(true);
System.exit(result?0:1);
}
}
1.4 啟動(dòng)集群蜻韭,將工程打成Jar包得院,并與數(shù)據(jù)文件一并上傳至集群中(省略)
1.5 清洗數(shù)據(jù)
[hadoop@hadoop131 software]$ yarn jar ETLUtils-0.0.1-SNAPSHOT.jar com.victor.ETLUtils.ETLUtilsDriver /videotable /videoOutput
處理前數(shù)據(jù)
處理后數(shù)據(jù)
2.Hive建表
思路:先分別建立video和user 的原始表傻铣,再建立video和user的ORC壓縮表以提高傳輸執(zhí)行效率
2.1 建立原始表
gulivideo_ori
CREATE TABLE gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>
)
row format delimited
fields terminated by '\t'
collection items terminated by '&';
guli_user_ori
create table guli_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
2.2 加載數(shù)據(jù)
load data inpath '/videotable' into table gulivideo_ori;
load data inpath '/usertable' into table guli_user_ori;
2.3 建立ORC表
create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>
)
row format delimited
fields terminated by '\t'
collection items terminated by '&'
stored as orc;
create table guli_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc;
2.4 向ORC表加載數(shù)據(jù)
insert into gulivideo_orc
select * from gulivideo_ori;
insert into guli_user_orc
select * from guli_user_ori;
3. 業(yè)務(wù)分析
3.1 統(tǒng)計(jì)視頻觀看數(shù) Top10
select videoid,views
from gulivideo_orc
order by views desc
limit 10;
+--------------+-----------+--+
| videoid | views |
+--------------+-----------+--+
| dMH0bHeiRNg | 42513417 |
| 0XxI-hvPRRA | 20282464 |
| 1dmVU08zVpA | 16087899 |
| RB-wUgnyGv0 | 15712924 |
| QjA5faZF1A8 | 15256922 |
| -_CSo1gOd48 | 13199833 |
| 49IDp76kjPw | 11970018 |
| tYnn51C3X_w | 11823701 |
| pv5zWaTEVkI | 11672017 |
| D2kJZOfq7zk | 11184051 |
+--------------+-----------+--+
3.2 統(tǒng)計(jì)視頻類別熱度 Top10
3.2.1 將表中category字段數(shù)組行轉(zhuǎn)列
select views,hot from
gulivideo_orc lateral view explode (category) category_t as hot;t1
3.2.2 統(tǒng)計(jì)每個(gè)類別的觀看總數(shù)
select hot,count(*)
from t1
group by hot;t2
3.2.3 獲取觀看前10的類別
select hot,total_view
from ()t2
order by total_view desc limit 10;
SELECT hot,
total_view
FROM (
SELECT hot,
Count(*) total_view
FROM (
SELECT views,
hot
FROM gulivideo_orc lateral view explode (category) category_t as hot)t1
GROUP BY hot)t2
ORDER BY total_view DESC limit 10;
+----------------+-------------+--+
| category_name | total_view |
+----------------+-------------+--+
| Music | 179049 |
| Entertainment | 127674 |
| Comedy | 87818 |
| Film | 73293 |
| Animation | 73293 |
| Sports | 67329 |
| Games | 59817 |
| Gadgets | 59817 |
| People | 48890 |
| Blogs | 48890 |
+----------------+-------------+--+
3.3 統(tǒng)計(jì)視頻觀看數(shù) Top20 所屬類別
3.3.1 觀看數(shù)top20視頻
select views,category
from gulivideo_orc
order by views desc
limit 20;t1
3.3.2 所屬類別
select views,category
from t1 lateral view explode(category)ct as category_name;
SELECT views,
category_name
FROM (
SELECT views,
category
FROM gulivideo_orc
ORDER BY views DESC limit 20)t1 lateral VIEW explode(category)ct AS category_name ;
+-----------+----------------+--+
| views | category_name |
+-----------+----------------+--+
| 42513417 | Comedy |
| 20282464 | Comedy |
| 16087899 | Entertainment |
| 15712924 | Entertainment |
| 15256922 | Music |
| 13199833 | People |
| 13199833 | Blogs |
| 11970018 | Comedy |
| 11823701 | Music |
| 11672017 | Music |
| 11184051 | People |
| 11184051 | Blogs |
| 10786529 | Entertainment |
| 10334975 | Entertainment |
| 10107491 | Comedy |
| 9579911 | Music |
| 9566609 | Comedy |
| 8825788 | UNA |
| 7533070 | Music |
| 7456875 | Entertainment |
| 7066676 | Comedy |
| 6952767 | Entertainment |
+-----------+----------------+--+
3.4 統(tǒng)計(jì)視頻觀看數(shù) Top50 所關(guān)聯(lián)視頻的所屬類別 Rank
3.4.1 觀看數(shù)top10,關(guān)聯(lián)視頻
select
videoid,views,category,relatedid
from
gulivideo_orc
order by
views desc
limit 50;t1
3.4.2 關(guān)聯(lián)視頻行轉(zhuǎn)列
select distinct(r_id)
from
t1 lateral view explode(relatedid) relatedtable as r_id;t2
3.4.3 視頻所屬類別
select r_id,g.category
from
t2.join gulivideo_orc g on r_id = g.videoid;t3
select r_id,g.category
from
t2 join gulivideo_orc g on r_id = g.videoid;t3
3.4.4 類別展開
select category_name
from ()t3 lateral view explode(category)t as category_name;t4
3.4.5 統(tǒng)計(jì)類別個(gè)數(shù)
select category_name,count(*) hot
from
t4 group by category_name,t_sum;t5
3.4.6 所屬類別排名
select * from
t5
order by hot desc;t6
SELECT *
FROM (
SELECT category_name,
Count(*) hot
FROM (
SELECT category_name
FROM (
SELECT r_id,
g.category
FROM (
SELECT DISTINCT(r_id)
FROM (
SELECT videoid,
views,
category,
relatedid
FROM gulivideo_orc
ORDER BY views DESC limit 50)t1 lateral VIEW explode(relatedid) relatedtable as r_id)t2
JOIN gulivideo_orc g
ON r_id = g.videoid)t3 lateral VIEW explode(category)t as category_name )t4
GROUP BY category_name)t5
ORDER BY hot DESC;
+-------------------+---------+--+
| t6.category_name | t6.hot |
+-------------------+---------+--+
| Comedy | 14 |
| Entertainment | 11 |
| Music | 10 |
| Film | 3 |
| Animation | 3 |
| People | 2 |
| Blogs | 2 |
| Travel | 1 |
| Sports | 1 |
| Howto | 1 |
| Places | 1 |
| DIY | 1 |
+-------------------+---------+--+
3.5 統(tǒng)計(jì)每個(gè)類別中的視頻熱度 Top10
SELECT category_name,
views,
videoid
FROM (
SELECT category_name,
videoid,
views,
Rank() OVER(partition BY category_name order by views) rank_no
FROM (
SELECT category_name,
videoid,
views
FROM gulivideo_orc lateral view explode(category) t as category_name
GROUP BY category_name,
videoid,
views
ORDER BY category_name,
views DESC)t1
ORDER BY category_name)t2
WHERE rank_no<=10;
3.6 統(tǒng)計(jì)每個(gè)類別中視頻流量 Top10
3.7 統(tǒng)計(jì)上傳視頻最多的用戶 Top10 以及他們上傳的視頻
1.找出上傳前10的用戶
select uploader,
videos
from
guli_user_orc
order by videos desc
limit 10;t1
2.找到上傳的所有視頻
select t1.uploader,
videoid,
views
from
()t1 join gulivideo_orc g
on
t.uploader=g.uploader
order by uploader,views desc; t2
SELECT t1.uploader,
videoid,
views
FROM (SELECT uploader,
videos
FROM guli_user_orc
ORDER BY videos DESC
LIMIT 10)t1
JOIN gulivideo_orc g
ON t1.uploader = g.uploader
order by uploader,views desc;
3.8 統(tǒng)計(jì)每個(gè)類別視頻觀看數(shù) Top10
1.統(tǒng)計(jì)所有類別對應(yīng)的視頻
select
category_name,videoid,views
from
gulivideo_orc
lateral view explode(category) t as category_name;t1
2.對每個(gè)類觀看數(shù)排名
select *,rank() over(partition by category_name order by views desc) rank_no
from
()t1;t2
3.取前十
select * from
()t2
where rank_no<=10;
SELECT *
FROM (
SELECT *,
Rank() OVER(partition BY category_name ORDER BY views DESC) rank_no
FROM (
SELECT category_name,
videoid,
views
FROM gulivideo_orc lateral view explode(category) t as category_name)t1 )t2
WHERE rank_no<=10;
+-------------------+--------------+-----------+-------------+--+
| t2.category_name | t2.videoid | t2.views | t2.rank_no |
+-------------------+--------------+-----------+-------------+--+
| Animals | 2GWPOPSXGYI | 3660009 | 1 |
| Animals | xmsV9R8FsDA | 3164582 | 2 |
| Animals | 12PsUW-8ge4 | 3133523 | 3 |
| Animals | OeNggIGSKH8 | 2457750 | 4 |
| Animals | WofFb_eOxxA | 2075728 | 5 |
| Animals | AgEmZ39EtFk | 1999469 | 6 |
| Animals | a-gW3RbJd8U | 1836870 | 7 |
| Animals | 8CL2hetqpfg | 1646808 | 8 |
| Animals | QmroaYVD_so | 1645984 | 9 |
| Animals | Sg9x5mUjbH8 | 1527238 | 10 |
| Animation | sdUUx5FdySs | 5840839 | 1 |
| Animation | 6B26asyGKDo | 5147533 | 2 |
| Animation | H20dhY01Xjk | 3772116 | 3 |
| Animation | 55YYaJIrmzo | 3356163 | 4 |
| Animation | JzqumbhfxRo | 3230774 | 5 |
| Animation | eAhfZUZiwSE | 3114215 | 6 |
| Animation | h7svw0m-wO0 | 2866490 | 7 |
| Animation | tAq3hWBlalU | 2830024 | 8 |
| Animation | AJzU3NjDikY | 2569611 | 9 |
| Animation | ElrldD02if0 | 2337238 | 10 |