構(gòu)建clickhouse復(fù)雜數(shù)據(jù)模型

智能路徑

  • 輸入况凉,在數(shù)據(jù)范圍內(nèi)指定結(jié)束事件與窗口大小
  • 返回本姥,按用戶訪問(wèn)時(shí)間由小到大排序后的路徑字符串
select
  d_i,
  arrayStringConcat(
    arrayMap(
      b - > tupleElement(b, 1),
      arraySort(
        y - > tupleElement(y, 2),
        arrayFilter(
          (x, y, z) - > toDateTimeOrZero(z) - toDateTimeOrZero(y) < 1000,
          arrayMap(
            (x, y) - > (x, y),
            groupArray(e_t),
            groupArray(time)
          ),
          groupArray(time),
          arrayWithConstant(
            length(groupArray(time)),
            maxIf(time, e_t = 'launch')
          )
        )
      )
    ),
    '->'
  ) path
from
  bw.scene_tracker
where
  d_i <> ''
group by
  d_i


例子

上述例子窗口大小為1000s妻枕,結(jié)束事件 “l(fā)aunch”嚷往; 億級(jí)數(shù)據(jù)妙出

  • 簡(jiǎn)版

select path,count(1)cn from(
select uid, maxIf( ts, url = 'https://ark.analysys.cn/browseGoods' ) as maxTime,
arrayFilter(x->maxTime-x.1<60*30*1000 and maxTime>=x.1 , groupArray( (ts,url) )) as window_data,
arraySort(x->x.1,window_data) as sort_data,
arrayStringConcat(sort_data.2,'->') as path
from bw.session group by uid ) where path<>'' group by path order by cn desc limit 11

  • 對(duì)路徑中相鄰頁(yè)面重復(fù)的數(shù)據(jù)進(jìn)行去重
select path,count(1)cn from(
select uid, maxIf( ts, url = 'https://ark.analysys.cn/browseGoods' ) as maxTime,
arrayFilter(x->maxTime-x.1<60*30*1000 and maxTime>=x.1 , groupArray( (ts,url) )) as window_data,
arraySort(x->x.1,window_data) as sort_data,
arrayFilter((x,y)->x<>sort_data[y-1].2 ,sort_data.2,arrayEnumerate( sort_data )) as sort_data_url, --相鄰去重
arrayStringConcat(sort_data_url,'->') as path
from bw.session group by uid ) where path<>'' group by path order by cn desc limit 11

  • 線上環(huán)境測(cè)試版
select   data, count(1) cn from (
 with  maxIf( c_t , cat='page_view'and act='頁(yè)面_瀏覽') as max_time, -- 目標(biāo)事件時(shí)間
         arraySort(
           e -> e.1,
           arrayFilter(x->x.1<=toUInt64OrZero(max_time),groupArray((toUInt64OrZero(c_t), (cat,act) )))
         ) as sorted_array,
          -- 按時(shí)間排序后的數(shù)據(jù)
         arrayPushFront( sorted_array, sorted_array[1] ) as e_arr,
         arrayFilter(
           (i, e,z) -> z.1  < toUInt64OrZero(max_time)  
                   and   (e > 1800000 or (z.2.1='page_view' and z.2.2='頁(yè)面_瀏覽')),  
            arrayEnumerate(e_arr), arrayDifference( e_arr.1 ),e_arr
         ) as arr_indx, -- 過(guò)濾目標(biāo)事件葛账、時(shí)間差后的數(shù)據(jù)
         arrayReduce('max',arr_indx) as smIndx,
         arrayFilter(
           (e,i) ->  i>=smIndx and e.1<=toUInt64OrZero(max_time)   ,  
           sorted_array, arrayEnumerate(sorted_array) 
         ) as data_ 
 
 select u_i,
         arrayFilter((x,y)-> y<>0 ,data_.2,arrayDifference(arrayEnumerateDense(data_.2))) as data__,  
         arraySlice(data__,length(data__)-8,8 ) as data,
        --  arrayStringConcat(data,'->') as path,
         hasAll(data, [ ('page_view','頁(yè)面_瀏覽') ]) as has_way_point 
    from app.scene_tracker where c_p='PC'   and length(u_i)>20  
    group by u_i  having length(data)>1 
  ) tab
where has_way_point=1 group by data order by cn desc limit 1000


易觀 OLAP Session分析

http://ds.analysys.cn/2019/session.html

1、計(jì)算默認(rèn)session每天的會(huì)話次數(shù)皮仁、人均訪問(wèn)時(shí)長(zhǎng)籍琳、退出率

SELECT
    day,
    countDistinct(sid) AS scn,
    countDistinct(uid) AS ucn,
    sum(t2 - t1) / ucn AS dur,
    countIf(t1 = t2) / scn AS t_rate
FROM
(
    SELECT
        day,
        uid,
        sid,
        min(ts2) AS t1,
        max(ts2) AS t2
    FROM bw.session2
    GROUP BY
        day,
        uid,
        sid
)
GROUP BY day
ORDER BY day ASC

2、根據(jù)動(dòng)態(tài)session計(jì)算每日會(huì)話次數(shù)

  • 第一版
#  4s
select
  day,
  sum(length(sessions))
from
  (
    select
      day,
      arrayFilter(
        (y, z) - > dateDiff(
          'minute',
          toDateTimeOrZero(y),
          toDateTimeOrZero(z)
        ) > 30,
        arraySort(x - > x, groupArray(time)),
        arrayPushBack(
          arrayPopFront(arraySort(x - > x, groupArray(time))),
          '2029-09-08 23:21:30'
        )
      ) sessions
    from
      bw.scene_tracker
    where
      d_i <> ''
    group by
      day,
      d_i
  )
where
  length(sessions) > 0
group by
  day


  • 第二版
# 4s
select day,sumArray(arrayFilter((z,x,y)->dateDiff( 'minute', toDateTimeOrZero(x) ,toDateTimeOrZero(y)) >30,tupleElement(sessions,3),tupleElement(sessions,1), tupleElement(sessions,2)) ) from (
select day, arrayMap((x,y) -> ( x,y,1),
arraySort(groupArray(time)),
arrayPushBack(arrayPopFront(arraySort(x->x, groupArray(time))),'2029-09-08 23:21:30')) sessions
from bw.scene_tracker where d_i<>'' group by day, d_i )    group by day

  • 第三版
# 4s

select day,sum(length(arrayFilter((x,y)->dateDiff( 'minute', toDateTimeOrZero(x) ,toDateTimeOrZero(y)) >30, t1, t2)) ) from (
select day,  
arraySort(x->x, groupArray(time )) t1 ,
arrayPushBack(arrayPopFront( t1 ),'2029-09-08 23:21:30') t2
from bw.scene_tracker where d_i<>'' group by day, d_i )    group by day limit 1111
 
  • 終結(jié)版
# 使用超時(shí)時(shí)間30分鐘+跨天的session切割規(guī)則贷祈,計(jì)算出20190501-20190510趋急,每天的會(huì)話次數(shù) 
select day ,sumArray( sessions ) from (
select day,  arrayMap(( y,z) -> if(dateDiff( 'minute', y ,z)>30,1,0)  , 
arraySort(groupArray( ts2)) as t1,
arrayPushBack(arrayPopFront(t1) ,addYears(now(),1000))) sessions 
from bw.session2    group by day, uid ) where length(sessions)>0 group by day  

  • 簡(jiǎn)化版
select day ,sum(length( sessions )) from (
select day, arrayFilter(x ->  x>30*60*1000,
arrayDifference(arraySort( groupArray( ts ))))   sessions
from bw.session2    group by day, uid )   group by day  

  • 整合版
select day , sum(arrayUniq(psarray)) pscn, sum(arrayUniq(psarray)-length(intersect)) /sum(length(idxx)) as t_rate from (   
select day,       
arraySort(x->x.2, groupArray( (url,ts,event_code )))as cur,   
arrayFilter( (x,y,z)->  y>30*60*1000 or z='$start_event'   , range( length(cur)) , arrayDifference(cur.2) ,cur.3)   as idx ,
arrayPushBack(idx ,length(cur)) as idx2, 
arrayEnumerate(idx2) as idxx, 
flatten(arrayMap((x,y)->arrayResize([concat(toString(x),toString(uid)) ], x-idx2[y-1], concat(toString(x),toString(uid)) ), idx2,  idxx ) ) as tkarray ,
arrayFilter((x,y)-> y='https://ark.analysys.cn/browseGoods' , tkarray, cur.1) as psarray  , 
arrayFilter((x,y)-> y<>'https://ark.analysys.cn/browseGoods' , tkarray, cur.1) as psother,
arrayIntersect(psarray,psother) intersect 
from bw.session2    group by day, uid )  group by day 

  • 另一種實(shí)現(xiàn) 性能一般般
# 結(jié)果與上面一致
select day , sumArray(tp.3) pscn, sumArray(tp.2) /sum(length(tp)) as trate from (   
select day,       
arraySort(x->x.2, groupArray( (url,ts )))as cur,   
arrayFilter( (x,y )->  y>30*60*1000 , range( length(cur)) , arrayDifference(cur.2) )   as idx ,
arrayPushBack(idx ,length(cur)) as idx2, 
arrayEnumerate(idx2) as idxx, 
arrayMap((x,y)-> (arraySlice(cur.1, idx2[y-1]+1, x-idx2[y-1]) as session, if(hasAny(session,['https://ark.analysys.cn/browseGoods']) and  arrayUniq(session)=1,1,0) as depth, hasAny(session,['https://ark.analysys.cn/browseGoods']) as pscn),  idx2, idxx) tp 
from bw.session2    group by day, uid )  group by day 


2、根據(jù)動(dòng)態(tài)session計(jì)算每日著陸頁(yè)的跳出率

跳出率=訪問(wèn)了一個(gè)頁(yè)面的Session數(shù)/總的Session數(shù)

  • 第一版
select day ,sumArray( sessions.1 )/sumArray( sessions.2 ) from (  
select day,  arrayMap(( x,y,z) -> (if(dateDiff( 'minute', y.2 ,z.2)>30 and endsWith(y.1,'index'),1,0) ,if(dateDiff(   'minute', y.2 ,z.2)>30,1,0))  ,  
arrayMap((x,y)->(x,y),groupArray( url ), groupArray( ts2))as data,  
arraySort(x->x.2, data) as t1,  
arrayPushBack(arrayPopFront(t1) ,('',addYears(now(),1000)))  
) sessions   
from bw.session2    group by day, uid )  group by day 

3势誊、 使用超時(shí)時(shí)間30分鐘+跨天+指定開(kāi)始事件宣谈,的session切割規(guī)則計(jì)算出20190501-20190510,每天包含某個(gè)頁(yè)面行為的會(huì)話總數(shù)键科,人均訪問(wèn)深度。

  • 第一版
--每天包含某個(gè)頁(yè)面行為的會(huì)話總數(shù)

select day , sum(arrayUniq(cn)) from (  
select day,     
arraySort(x->x.2,arrayMap((x,y,z)->(x,y,z),groupArray( url ), groupArray( ts2 ), groupArray( event_code )))as cur,  
arrayPushBack(arrayPopFront(cur) ,('', addYears(now(),1000) ,'')) as next,
arrayEnumerate(cur) as inx,
arrayFilter( (x,y,z)-> dateDiff( 'minute', y.2 ,z.2)>30 or z.3='$start_event' , inx, cur, next)   as idx,
arrayEnumerate(idx) as idxx,
flatten(arrayMap((x,y)->arrayWithConstant(x-idx[y-1],x), idx,  idxx) ) as tkarray ,
arrayResize( tkarray, length( cur ), length( cur )) as narray,
arrayFilter((x,y)-> y.1='https://ark.analysys.cn/browseGoods' , narray, cur)  cn
from bw.session2    group by day, uid )  group by day 

--人均訪問(wèn)深度

select day , sum(arrayUniq(cn)) pcn,sum(length(idx)) scn from (  
select day,     
arraySort(x->x.2,arrayMap((x,y,z)->(x,y,z),groupArray( url ), groupArray( ts2 ), groupArray( event_code )))as cur,  
arrayPushBack(arrayPopFront(cur) ,('', addYears(now(),1000) ,'')) as next,
arrayEnumerate(cur) as inx,
arrayFilter( (x,y,z)-> dateDiff( 'minute', y.2 ,z.2)>30 or z.3='$start_event' , inx, cur, next)   as idx,
arrayEnumerate(idx) as idxx,
flatten(arrayMap((x,y)->arrayWithConstant(x-idx[y-1],x), idx,  idxx) ) as tkarray ,
arrayResize( tkarray, length( cur ), length( cur )) as narray,
arrayMap((x,y)-> concat(y.1,'$$',toString( x))  , narray, cur)  cn
from bw.session2    group by day, uid )  group by day 

  • 第二版(整合版)
-- 第三題 簡(jiǎn)版
select day , sum(arrayUniq(psarray)) pscn, sum(arrayUniq(deptharray)) /sum(length(idxx)) as avgdepth from (   
select day,       
arraySort(x->x.2, groupArray( (url,ts,event_code )))as cur,   
arrayFilter( (x,y,z)->  y>30*60*1000 or z='$start_event'   , range( length(cur)) , arrayDifference(cur.2) ,cur.3)   as idx ,
arrayPushBack(idx ,length(cur)) as idx2, 
arrayEnumerate(idx2) as idxx, 
flatten(arrayMap((x,y)->arrayWithConstant(x-arrayElement(idx2, y-1), concat(toString(x),toString(uid)) ), idx2,  idxx ) ) as tkarray ,
arrayFilter((x,y)-> y='https://ark.analysys.cn/browseGoods' , tkarray, cur.1) as psarray, 
arrayMap((x,y)-> concat(y,'$$',   x )  , tkarray, cur.1) as deptharray  
from bw.session2    group by day, uid )  group by day  


  • 第三版
# 用 arrayResize 函數(shù)替代 arrayWithConstant 性能提升2s
select day , sum(arrayUniq(psarray)) pscn, sum(arrayUniq(deptharray)) /sum(length(idxx)) as avgdepth from (   
select day,       
arraySort(x->x.2, groupArray( (url,ts,event_code )))as cur,   
arrayFilter( (x,y,z)->  y>30*60*1000 or z='$start_event'   , range( length(cur)) , arrayDifference(cur.2) ,cur.3)   as idx ,
arrayPushBack(idx ,length(cur)) as idx2, 
arrayEnumerate(idx2) as idxx, 
flatten(arrayMap((x,y)->arrayResize([concat(toString(x),toString(uid)) ], x-idx2[y-1], concat(toString(x),toString(uid)) ), idx2,  idxx ) ) as tkarray ,
arrayFilter((x,y)-> y='https://ark.analysys.cn/browseGoods' , tkarray, cur.1) as psarray, 
arrayMap((x,y)-> concat(y,   x )  , tkarray, cur.1) as deptharray  
from bw.session2    group by day, uid )  group by day 

  • 另一種解題思路
select day , sum(pacn) pscn, sum(cl) /sum(asm) as avgdepth from (   
select day,       
arraySort(x->x.2, groupArray( (url,ts )))as cur,   
-- arrayMap( (y,z)->  if(y>30*60*1000 or z='https://ark.analysys.cn/startUp' ,1,0)  , arrayDifference(cur.2) ,cur.1)   as idx ,
arrayMap((y,z)->  if(y>30*60*1000 or z='https://ark.analysys.cn/startUp' ,1,0), arrayDifference(cur.2) ,cur.1) amap,
arrayCumSum(amap) as rsid,
arraySum(amap )+1 asm,
arrayFilter((x,y)-> y='https://ark.analysys.cn/browseGoods' , rsid, cur.1) as psarray, 
arrayUniq(  psarray )  as pacn,length(cur) as cl
from bw.session3    group by day, uid )  group by day  


建表語(yǔ)句

CREATE TABLE bw.session2 (`uid` Int64, `ts` UInt64, `event_code` String, `sid` String, `url` String, `platform` String, `source` String, `city` String, `brand` String, `buy_count` Int32, `price` Float64, `day` LowCardinality(String), `ts2` DateTime, `uid2` LowCardinality(String)) ENGINE = MergeTree PARTITION BY tuple() ORDER BY (day, uid2, ts2) SETTINGS index_granularity = 8192

最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
  • 序言:七十年代末漩怎,一起剝皮案震驚了整個(gè)濱河市勋颖,隨后出現(xiàn)的幾起案子,更是在濱河造成了極大的恐慌勋锤,老刑警劉巖饭玲,帶你破解...
    沈念sama閱讀 216,324評(píng)論 6 498
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件,死亡現(xiàn)場(chǎng)離奇詭異叁执,居然都是意外死亡茄厘,警方通過(guò)查閱死者的電腦和手機(jī),發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 92,356評(píng)論 3 392
  • 文/潘曉璐 我一進(jìn)店門谈宛,熙熙樓的掌柜王于貴愁眉苦臉地迎上來(lái)次哈,“玉大人,你說(shuō)我怎么就攤上這事吆录∫ぶ停” “怎么了?”我有些...
    開(kāi)封第一講書人閱讀 162,328評(píng)論 0 353
  • 文/不壞的土叔 我叫張陵,是天一觀的道長(zhǎng)哀卫。 經(jīng)常有香客問(wèn)我巨坊,道長(zhǎng),這世上最難降的妖魔是什么裁蚁? 我笑而不...
    開(kāi)封第一講書人閱讀 58,147評(píng)論 1 292
  • 正文 為了忘掉前任术幔,我火速辦了婚禮赡译,結(jié)果婚禮上,老公的妹妹穿的比我還像新娘占调。我一直安慰自己,他們只是感情好勋磕,可當(dāng)我...
    茶點(diǎn)故事閱讀 67,160評(píng)論 6 388
  • 文/花漫 我一把揭開(kāi)白布妈候。 她就那樣靜靜地躺著,像睡著了一般挂滓。 火紅的嫁衣襯著肌膚如雪苦银。 梳的紋絲不亂的頭發(fā)上,一...
    開(kāi)封第一講書人閱讀 51,115評(píng)論 1 296
  • 那天赶站,我揣著相機(jī)與錄音幔虏,去河邊找鬼。 笑死贝椿,一個(gè)胖子當(dāng)著我的面吹牛想括,可吹牛的內(nèi)容都是我干的。 我是一名探鬼主播烙博,決...
    沈念sama閱讀 40,025評(píng)論 3 417
  • 文/蒼蘭香墨 我猛地睜開(kāi)眼瑟蜈,長(zhǎng)吁一口氣:“原來(lái)是場(chǎng)噩夢(mèng)啊……” “哼!你這毒婦竟也來(lái)了渣窜?” 一聲冷哼從身側(cè)響起铺根,我...
    開(kāi)封第一講書人閱讀 38,867評(píng)論 0 274
  • 序言:老撾萬(wàn)榮一對(duì)情侶失蹤,失蹤者是張志新(化名)和其女友劉穎乔宿,沒(méi)想到半個(gè)月后位迂,有當(dāng)?shù)厝嗽跇?shù)林里發(fā)現(xiàn)了一具尸體,經(jīng)...
    沈念sama閱讀 45,307評(píng)論 1 310
  • 正文 獨(dú)居荒郊野嶺守林人離奇死亡详瑞,尸身上長(zhǎng)有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點(diǎn)故事閱讀 37,528評(píng)論 2 332
  • 正文 我和宋清朗相戀三年掂林,在試婚紗的時(shí)候發(fā)現(xiàn)自己被綠了。 大學(xué)時(shí)的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片坝橡。...
    茶點(diǎn)故事閱讀 39,688評(píng)論 1 348
  • 序言:一個(gè)原本活蹦亂跳的男人離奇死亡泻帮,死狀恐怖,靈堂內(nèi)的尸體忽然破棺而出计寇,到底是詐尸還是另有隱情刑顺,我是刑警寧澤氯窍,帶...
    沈念sama閱讀 35,409評(píng)論 5 343
  • 正文 年R本政府宣布,位于F島的核電站蹲堂,受9級(jí)特大地震影響狼讨,放射性物質(zhì)發(fā)生泄漏。R本人自食惡果不足惜柒竞,卻給世界環(huán)境...
    茶點(diǎn)故事閱讀 41,001評(píng)論 3 325
  • 文/蒙蒙 一政供、第九天 我趴在偏房一處隱蔽的房頂上張望。 院中可真熱鬧朽基,春花似錦布隔、人聲如沸。這莊子的主人今日做“春日...
    開(kāi)封第一講書人閱讀 31,657評(píng)論 0 22
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽(yáng)。三九已至霎俩,卻和暖如春哀军,著一層夾襖步出監(jiān)牢的瞬間,已是汗流浹背打却。 一陣腳步聲響...
    開(kāi)封第一講書人閱讀 32,811評(píng)論 1 268
  • 我被黑心中介騙來(lái)泰國(guó)打工杉适, 沒(méi)想到剛下飛機(jī)就差點(diǎn)兒被人妖公主榨干…… 1. 我叫王不留,地道東北人柳击。 一個(gè)月前我還...
    沈念sama閱讀 47,685評(píng)論 2 368
  • 正文 我出身青樓猿推,卻偏偏與公主長(zhǎng)得像,于是被迫代替她去往敵國(guó)和親捌肴。 傳聞我的和親對(duì)象是個(gè)殘疾皇子蹬叭,可洞房花燭夜當(dāng)晚...
    茶點(diǎn)故事閱讀 44,573評(píng)論 2 353

推薦閱讀更多精彩內(nèi)容

  • 注:內(nèi)容來(lái)源于網(wǎng)易云課堂產(chǎn)品經(jīng)理微專業(yè) 一、數(shù)據(jù)分析概述 數(shù)據(jù)是量化事物的手段状知,數(shù)據(jù)指標(biāo)代表了現(xiàn)實(shí)存在的客觀情況具垫,...
    夜妖黑貓閱讀 3,340評(píng)論 3 49
  • ORA-00001: 違反唯一約束條件 (.) 錯(cuò)誤說(shuō)明:當(dāng)在唯一索引所對(duì)應(yīng)的列上鍵入重復(fù)值時(shí),會(huì)觸發(fā)此異常试幽。 O...
    我想起個(gè)好名字閱讀 5,306評(píng)論 0 9
  • 每天進(jìn)步一點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)點(diǎn)~~從開(kāi)始只能寫幾句話、模仿別人的觀點(diǎn)卦碾,到現(xiàn)...
    一個(gè)帥氣的名字呀閱讀 18,077評(píng)論 4 31
  • 國(guó)家電網(wǎng)公司企業(yè)標(biāo)準(zhǔn)(Q/GDW)- 面向?qū)ο蟮挠秒娦畔?shù)據(jù)交換協(xié)議 - 報(bào)批稿:20170802 前言: 排版 ...
    庭說(shuō)閱讀 10,952評(píng)論 6 13
  • [實(shí)用方法][數(shù)學(xué)原理] 封面用研整編文章 在用戶分析領(lǐng)域铺坞,對(duì)用戶行為洞察的需求正盛。本文介紹常用的Session...
    CoverUER閱讀 5,605評(píng)論 0 11