實戰背景
新聞網站
- 版塊
- 新聞頁面
- 新用戶注冊
- 用戶跳出
案例需求分析
- 每天每個頁面的PV
PV是Page View,是指一個頁面被所有用戶訪問次數的總和,頁面被訪問一次就被記錄1次PV
- 每天每個頁面的UV
UV是Unique Visitor,是指一個頁面被多少個用戶訪問了,一個用戶訪問一次是1次UV,一個用戶訪問多次還是1次UV
- 新用戶注冊比率
當天注冊用戶數 / 當天未注冊用戶的訪問數
- 用戶跳出率
IP只瀏覽了一個頁面就離開網站的次數 / 網站總訪問數(PV)
- 版塊熱度排行榜
根據每個版塊每天被訪問的次數,做出一個排行榜
網站日志格式
date timestamp userid pageid section action
日志字段說明
date: 日期,yyyy-MM-dd格式
timestamp: 時間戳
userid: 用戶id
pageid: 頁面id
section: 新聞版塊
action: 用戶行為,共兩類:點擊頁面和注冊
模擬數據生成程序
/**
 * Generates a simulated news-site access log for offline analysis.
 *
 * <p>Every record is one line holding six fields in the order
 * {@code date, timestamp, userid, pageid, section, action}. Fields are separated
 * by {@code "?"} and the line is terminated by {@code "\n"}. Absent values are
 * written as the literal text {@code null}.
 *
 * <p>Records are streamed straight to the output file instead of being collected
 * in one in-memory buffer, so memory use does not grow with the record count.
 */
public class OfflineDataGenerator {

    /** Output file used when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "C:\\Users\\ZJ\\Desktop\\access.log";

    /** Number of "view" records written when no count is passed on the command line. */
    private static final int DEFAULT_VIEW_COUNT = 10000000;

    /** Number of "register" records written when no count is passed on the command line. */
    private static final int DEFAULT_REGISTER_COUNT = 100000;

    /** Registered user ids are drawn from {@code [0, REGISTERED_USER_COUNT)}. */
    private static final int REGISTERED_USER_COUNT = 1000;

    /** Page ids are drawn from {@code [0, PAGE_COUNT)}. */
    private static final int PAGE_COUNT = 1000;

    /** Text written between two adjacent fields of a record. */
    private static final String FIELD_SEPARATOR = "?";

    /** News sections a simulated page view can belong to. */
    private static final String[] SECTIONS = {
        "country", "international", "sport", "entertainment", "movie",
        "carton", "tv-show", "technology", "internet", "car"
    };

    /**
     * Writes the simulated log file.
     *
     * @param args optional overrides, all positional:
     *             {@code args[0]} output file path,
     *             {@code args[1]} number of view records,
     *             {@code args[2]} number of register records.
     *             Any argument that is omitted keeps its original hard-coded value.
     * @throws Exception if the output file cannot be opened or written, or if a
     *                   count argument is not a valid integer
     */
    public static void main(String[] args) throws Exception {
        int argCount = (args == null) ? 0 : args.length;
        String outputPath = argCount > 0 ? args[0] : DEFAULT_OUTPUT_PATH;
        int viewCount = argCount > 1 ? Integer.parseInt(args[1]) : DEFAULT_VIEW_COUNT;
        int registerCount = argCount > 2 ? Integer.parseInt(args[2]) : DEFAULT_REGISTER_COUNT;

        Random random = new Random();

        // The log date defaults to yesterday.
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DAY_OF_YEAR, -1);
        String date = new SimpleDateFormat("yyyy-MM-dd").format(cal.getTime());

        // try-with-resources closes the writer on every path; a failure to open the
        // file now surfaces as the original exception instead of an NPE in finally.
        try (PrintWriter out = new PrintWriter(new java.io.BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputPath), "UTF-8")))) {

            // Page-view records.
            for (int i = 0; i < viewCount; i++) {
                long timestamp = System.currentTimeMillis();

                // One visitor in ten is unregistered and therefore has no userid.
                Long userid;
                if (random.nextInt(10) == 0) {
                    userid = null;
                } else {
                    userid = (long) random.nextInt(REGISTERED_USER_COUNT);
                }

                Long pageid = (long) random.nextInt(PAGE_COUNT);
                String section = SECTIONS[random.nextInt(SECTIONS.length)];

                out.write(formatRecord(date, timestamp, userid, pageid, section, "view"));
            }

            // Registration records: userid, pageid and section are all absent.
            for (int i = 0; i < registerCount; i++) {
                long timestamp = System.currentTimeMillis();
                out.write(formatRecord(date, timestamp, null, null, null, "register"));
            }

            // PrintWriter swallows IOExceptions internally; checkError() flushes and
            // reports whether any write failed.
            if (out.checkError()) {
                throw new java.io.IOException("Failed to write access log to " + outputPath);
            }
        }
    }

    /** Renders one log line; {@code null} fields are written as the text "null". */
    private static String formatRecord(String date, long timestamp, Long userid,
                                       Long pageid, String section, String action) {
        return new StringBuilder(64)
                .append(date).append(FIELD_SEPARATOR)
                .append(timestamp).append(FIELD_SEPARATOR)
                .append(userid).append(FIELD_SEPARATOR)
                .append(pageid).append(FIELD_SEPARATOR)
                .append(section).append(FIELD_SEPARATOR)
                .append(action).append("\n")
                .toString();
    }
}
創建相關表
在hive中創建訪問日志表
-- Access-log table: one row per log record, columns in the same order as the
-- fields of the log format (date, timestamp, userid, pageid, section, action).
-- `date` and `timestamp` are reserved keywords in Hive 1.2.0 and later, so they
-- are quoted with backticks; the column names themselves are unchanged.
-- NOTE(review): there is no ROW FORMAT clause, so Hive's default field delimiter
-- applies -- confirm it matches the separator used in the generated log file.
create table news (
  `date` string,
  `timestamp` bigint,
  userid bigint,
  pageid bigint,
  section string,
  action string);
將模擬數據導入hive表中
-- Load the log file from the local filesystem into the news table.
-- NOTE(review): the data generator's default output file is named access.log;
-- confirm the file has been copied or renamed to this path before loading.
load data local inpath '/opt/spark-study/news.log' into table news;
編碼
main方法
/**
 * Entry point of the offline news statistics job.
 *
 * <p>Creates a Hive-enabled {@code SparkSession}, computes the five daily
 * metrics for yesterday's date in order, and always stops the session
 * afterwards, even when one of the metrics fails.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .appName("NewsOfflineStatSpark")
            .enableHiveSupport()
            .getOrCreate();
    try {
        String yesterday = getYesterday();

        // Metric 1: page PV statistics, sorted.
        calculateDailyPagePv(sparkSession, yesterday);
        // Metric 2: page UV statistics, sorted.
        calculateDailyPageUv(sparkSession, yesterday);
        // Metric 3: new-user registration rate.
        calculateDailyNewUserRegisterRate(sparkSession, yesterday);
        // Metric 4: user jump (bounce) rate.
        calculateDailyUserJumpRate(sparkSession, yesterday);
        // Metric 5: section popularity ranking.
        calculateDailySectionPvSort(sparkSession, yesterday);
    } finally {
        // Stop the session so the underlying SparkContext is shut down on every path.
        sparkSession.stop();
    }
}
getYesterday方法
/**
 * Returns yesterday's date, relative to the current system time, formatted as
 * {@code yyyy-MM-dd}.
 *
 * @return yesterday's date string
 */
private static String getYesterday() {
    // A fresh Calendar instance already represents the current moment.
    Calendar calendar = Calendar.getInstance();
    calendar.add(Calendar.DAY_OF_YEAR, -1);
    return new SimpleDateFormat("yyyy-MM-dd").format(calendar.getTime());
}
}
每天每個頁面的PV
/**
 * Prints the PV (page views) of every page for one day, highest PV first.
 *
 * <p>Only rows of the {@code news} table whose action is {@code 'view'} are counted.
 *
 * @param sparkSession session used to run the query
 * @param yesterday    day to report on; inserted into the SQL as the {@code date} filter value
 */
private static void calculateDailyPagePv(SparkSession sparkSession, String yesterday) {
    // Resulting SQL, shown for date 2019-01-24:
    //   select date, pageid, pv from (
    //     select date, pageid, count(pageid) as pv from news
    //     where date = '2019-01-24' and action = 'view'
    //     group by date, pageid
    //   ) t
    //   order by pv desc;
    StringBuilder query = new StringBuilder();
    query.append("select date, pageid, pv from ( ");
    query.append("select date, pageid, count(pageid) as pv from news ");
    query.append("where date = '").append(yesterday).append("' ");
    query.append(" and action = 'view' ");
    query.append("group by date, pageid ");
    query.append(") t ");
    query.append("order by pv desc");

    sparkSession.sql(query.toString()).show();
}
每天每個頁面的UV
/**
 * Prints the UV of every page for one day, highest UV first.
 *
 * <p>The innermost query collapses the day's {@code 'view'} rows to one row per
 * (date, pageid, userid); the middle query then counts the userids per page.
 *
 * @param sparkSession session used to run the query
 * @param yesterday    day to report on; inserted into the SQL as the {@code date} filter value
 */
private static void calculateDailyPageUv(SparkSession sparkSession, String yesterday) {
    // Step 1: one row per (date, pageid, userid) among the day's page views.
    String visitorsPerPage =
            "select date,pageid,userid from news "
            + "where date = '" + yesterday + "' "
            + "and action = 'view' "
            + "group by date,pageid,userid ";

    // Step 2: count the de-duplicated userids of each page.
    String uvPerPage =
            "select date, pageid, count(userid) as uv from ( "
            + visitorsPerPage
            + ") t1 "
            + "group by date,pageid ";

    // Step 3: order the pages by UV, largest first.
    String rankedUv =
            "select date,pageid, uv from ( "
            + uvPerPage
            + ") t2 "
            + "order by uv desc ";

    sparkSession.sql(rankedUv).show();
}
新用戶注冊比率
/**
 * Prints the new-user registration rate for one day.
 *
 * <p>The rate is the number of {@code 'register'} rows divided by the number of
 * {@code 'view'} rows whose userid is null (views by unregistered visitors).
 * Three lines are printed: the view count, the registration count and the rate.
 * When there are no such views the rate is reported as {@code 0.0} rather than
 * NaN or Infinity.
 *
 * @param sparkSession session used to run the queries
 * @param yesterday    day to report on; inserted into the SQL as the {@code date} filter value
 */
private static void calculateDailyNewUserRegisterRate(SparkSession sparkSession, String yesterday) {
    String anonymousViewSql = "SELECT count(*) FROM news WHERE action='view' AND date='" + yesterday + "' AND userid IS NULL";
    String registerSql = "SELECT count(*) FROM news WHERE action='register' AND date='" + yesterday + "' ";

    // count(*) without GROUP BY yields exactly one row, and Row.getLong returns a
    // primitive long, so the result can never be null and needs no null check.
    long anonymousViews = sparkSession.sql(anonymousViewSql).collectAsList().get(0).getLong(0);
    long registrations = sparkSession.sql(registerSql).collectAsList().get(0).getLong(0);

    // Print the result.
    System.out.println("======================" + anonymousViews + "======================");
    System.out.println("======================" + registrations + "======================");
    // Guard the denominator so an empty day does not print NaN/Infinity.
    double rate = (anonymousViews == 0L) ? 0.0 : (double) registrations / (double) anonymousViews;
    System.out.println("======================" + rate + "======================");
}
用戶跳出率
/**
 * Prints the user jump (bounce) rate for one day.
 *
 * <p>The rate is the number of users with exactly one {@code 'view'} row that
 * day divided by the total number of {@code 'view'} rows; both counts only
 * include rows whose userid is not null. Three lines are printed: the total
 * view count, the single-view user count and the rate. When there are no views
 * the rate is reported as {@code 0.0} rather than NaN or Infinity.
 *
 * @param sparkSession session used to run the queries
 * @param yesterday    day to report on; inserted into the SQL as the {@code date} filter value
 */
private static void calculateDailyUserJumpRate(SparkSession sparkSession, String yesterday) {
    // Total page views of the day by users with a userid.
    String totalViewSql = "select count(*) from news where action='view' and date='" + yesterday + "' and userid is not null";

    // Inner query, shown for date 2019-01-26:
    //   select date,userid,count(userid) as time from news
    //   where action='view' and date='2019-01-26' and userid is not null
    //   group by date,userid;
    // The outer query keeps the users who viewed exactly one page that day.
    String singleViewUserSql =
            "select count(userid) from ( " +
                "select date,userid,count(userid) as time from news where action='view' and date='" + yesterday + "' and userid is not null group by date,userid " +
            ") t " +
            "where time = 1";

    // Each aggregate query yields exactly one row, and Row.getLong returns a
    // primitive long, so the result can never be null and needs no null check.
    long totalViews = sparkSession.sql(totalViewSql).collectAsList().get(0).getLong(0);
    long singleViewUsers = sparkSession.sql(singleViewUserSql).collectAsList().get(0).getLong(0);

    // Print the result.
    System.out.println("======================" + totalViews + "======================");
    System.out.println("======================" + singleViewUsers + "======================");
    // Guard the denominator so an empty day does not print NaN/Infinity.
    double rate = (totalViews == 0L) ? 0.0 : (double) singleViewUsers / (double) totalViews;
    System.out.println("======================" + rate + "======================");
}
版塊熱度排行榜
/**
 * Prints the section popularity ranking for one day: the number of
 * {@code 'view'} rows per section, most viewed section first.
 *
 * @param sparkSession session used to run the query
 * @param yesterday    day to report on; inserted into the SQL as the {@code date} filter value
 */
private static void calculateDailySectionPvSort(SparkSession sparkSession, String yesterday) {
    // Per-section view counts, shown for date 2019-01-25:
    //   select date,section,count(section) as num from news
    //   where action='view' and date='2019-01-25' group by date,section
    String viewsPerSection =
            "select date,section,count(section) as num from news where action='view' and date='"
            + yesterday
            + "' group by date,section ";

    // Wrap the aggregate so the ranking can order by the computed count.
    String ranking = "select date,section,num from ( " + viewsPerSection + ") t " + "order by num desc";

    sparkSession.sql(ranking).show();
}