1. 程式人生 > >計算頁面停留時長的另類方式

計算頁面停留時長的另類方式

計算頁面停留時長,往常都需要跑spark或者mr任務來實現,如果能跑sql來實現那是最好不過了(sql是最好的語言),廢話不多說,直接搞起

注意:這裡使用的計算引擎是presto

首先來建立一張使用者瀏覽記錄資料表,表結構很簡單,只有3個欄位:uid 使用者id,page 頁面連結,time 進入此頁面的時間戳;如果有頁面離開時間,那離開時間-進入時間就是頁面停留時間;想的是挺好,可惜通常情況下,頁面打點只記錄進入頁面的時間,而不會記錄頁面離開時間;要計算頁面停留時間,就需要用使用者進入下個頁面nextpage的時間減去進入當前頁面curpage的時間,這也是計算頁面停留時間的難點;今天我們就用sql實現這個功能

-- Raw page-view log: one row per page entry.
--   uid  : user id
--   page : page link
--   time : timestamp at which the user entered the page
-- Presto has no LONG type; 64-bit integers are declared as BIGINT.
CREATE TABLE hadoop.wh.site_flow_test (
    uid varchar,
    page varchar,
    time bigint
)
WITH (format = 'ORC')

測試資料

userid1,index,1539242423
userid1,index,1539243421
userid1,page,1539243421
userid1,detail,1539213421
userid1,detail,1539223421
userid1,page,1539243121
userid1,detail,1539253421
userid2,index,1539241423
userid2,index,1539243221
userid2,page,1539243411
userid2,detail,1539242421
userid2,detail,1539240421
userid2,page,1539243221
userid2,detail,1539213421

第一步對每個使用者瀏覽的頁面資料進行聚合,格式如下:uid,[page1,page2],[time1,time2];
這份資料的意思是使用者進入page1頁面的時間是time1、進入page2頁面的時間是time2,其中time1和time2是升序排列,具體實現如下

-- Collapse each user's browsing log into two parallel arrays:
-- pages[i] is the page that was entered at arrs[i].
-- The ordering is declared inside array_agg itself, because an ORDER BY in a
-- CTE/subquery does not guarantee the order in which rows reach the aggregation.
-- Both aggregates use the same (time, page) sort key so that rows sharing a
-- timestamp land at the same index in both arrays.
CREATE TABLE hadoop.wh.site_flow_tmp (uid, pages, arrs)
AS
SELECT
    uid,
    array_agg(page ORDER BY time, page),
    array_agg(time ORDER BY time, page)
FROM hadoop.wh.site_flow_test
GROUP BY uid
結果如下

uid▼pages▼arrs▼

userid2	PrestoArray{ array=[detail, detail, index, detail, index, page, page]}	PrestoArray{ array=[1.539213421E9, 1.539240421E9, 1.539241423E9, 1.539242421E9, 1.539243221E9, 1.539243221E9, 1.539243411E9]}
userid1	PrestoArray{ array=[detail, detail, index, page, index, page, detail]}	PrestoArray{ array=[1.539213421E9, 1.539223421E9, 1.539242423E9, 1.539243121E9, 1.539243421E9, 1.539243421E9, 1.539253421E9]}

構建開始時間陣列和結束時間陣列,starts欄位是指進入對應pages下標頁面的時間,ends是對應pages下標頁面的離開時間;userid2使用者瀏覽了7個頁面,最後一個頁面的離開時間沒辦法關聯,所以最後一個頁面的離開時間就取進入時間,計算頁面停留時長的時候就會變成0

-- Derive per-visit entry and exit timestamp arrays from the timestamp array.
--   starts : the entry times (arrs taken from its first element onward).
--   ends   : arrs from its second element onward, followed by the last
--            element of arrs again, so the final page's exit time equals
--            its own entry time.
CREATE TABLE hadoop.wh.site_flow_tmp2 (uid, pages, starts, ends) AS
SELECT
    uid,
    pages,
    slice(arrs, 1, cardinality(arrs)) AS starts,
    slice(arrs, 2, cardinality(arrs)) || slice(arrs, cardinality(arrs), 1) AS ends
FROM hadoop.wh.site_flow_tmp


uid▼pages▼starts▼ends▼
userid2	
PrestoArray{ array=[detail, detail, index, detail, index, page, page]}	
PrestoArray{ array=[1.539213421E9, 1.539240421E9, 1.539241423E9, 1.539242421E9, 1.539243221E9, 1.539243221E9, 1.539243411E9]}	
PrestoArray{ array=[1.539240421E9, 1.539241423E9, 1.539242421E9, 1.539243221E9, 1.539243221E9, 1.539243411E9, 1.539243411E9]}
userid1	
PrestoArray{ array=[detail, detail, index, page, index, page, detail]}	
PrestoArray{ array=[1.539213421E9, 1.539223421E9, 1.539242423E9, 1.539243121E9, 1.539243421E9, 1.539243421E9, 1.539253421E9]}	PrestoArray{ array=[1.539223421E9, 1.539242423E9, 1.539243121E9, 1.539243421E9, 1.539243421E9, 1.539253421E9, 1.539253421E9]}




用離開時間減去進入時間即可得到頁面停留時間

-- Dwell time per visit = exit time - entry time, computed element-wise over
-- the parallel starts/ends arrays (result is in the same unit as the timestamps).
CREATE TABLE hadoop.wh.site_flow_tmp3 (uid, pages, starts, result) AS
SELECT
    uid,
    pages,
    starts,
    zip_with(starts, ends, (entered_at, left_at) -> left_at - entered_at)
FROM hadoop.wh.site_flow_tmp2


uid▼pages▼starts▼result▼
userid2	
PrestoArray{ array=[detail, detail, index, detail, index, page, page]}	
PrestoArray{ array=[1.539213421E9, 1.539240421E9, 1.539241423E9, 1.539242421E9, 1.539243221E9, 1.539243221E9, 1.539243411E9]}	
PrestoArray{ array=[27000.0, 1002.0, 998.0, 800.0, 0.0, 190.0, 0.0]}
userid1	
PrestoArray{ array=[detail, detail, index, page, index, page, detail]}	
PrestoArray{ array=[1.539213421E9, 1.539223421E9, 1.539242423E9, 1.539243121E9, 1.539243421E9, 1.539243421E9, 1.539253421E9]}	
PrestoArray{ array=[10000.0, 19002.0, 698.0, 300.0, 0.0, 10000.0, 0.0]}



資料平鋪,搞定

 -- Flatten the parallel arrays back into one row per page visit:
 -- UNNEST expands pages, starts and result in lockstep, so each output row
 -- carries the page, its entry time and its dwell time for one visit.
 SELECT
     f.uid,
     t.page,
     t.starttime,
     t.pagetime
 FROM hadoop.wh.site_flow_tmp3 AS f
 CROSS JOIN UNNEST(f.pages, f.starts, f.result) AS t (page, starttime, pagetime)
 

uid▼page▼starttime▼pagetime▼
userid2	detail	1539213421	27000
userid2	detail	1539240421	1002
userid2	index	1539241423	998
userid2	detail	1539242421	800
userid2	index	1539243221	0
userid2	page	1539243221	190
userid2	page	1539243411	0
userid1	detail	1539213421	10000
userid1	detail	1539223421	19002
userid1	index	1539242423	698
userid1	page	1539243121	300
userid1	index	1539243421	0
userid1	page	1539243421	10000
userid1	detail	1539253421	0