Hive 使用者訪問時長分析問題
阿新 • • 發佈:2022-03-07
需求描述
資料如下
問題:
- 使用者總量,使用者平均年齡,使用者平均觀看時長
- 每10歲一個分段,統計每個區間的使用者總量,使用者平均觀看時長
- 每個使用者最喜歡的節目
- 觀看時長大於5min的使用者總量,只要有一個節目使用者觀看時間小於5min就不能算
資料準備
-- Fixture for the viewing-duration exercises: one row per viewing record
-- (view_date, user_id, age, programid, playtime).
-- NOTE(review): view_date literals look like yyyyMMdd and playtime is presumably
-- in minutes (the requirements speak of "5min") -- confirm.
-- Dropping first keeps the setup re-runnable: otherwise a second run fails on
-- CREATE TABLE or appends a second copy of the sample rows.
drop table if exists temp_userlook_0305;

create table temp_userlook_0305 (
    view_date bigint,
    user_id   string,
    age       int,
    programid string,
    playtime  int
)
stored as orc
tblproperties ('orc.compress' = 'snappy');

-- Explicit column list so the rows stay correct if the table layout changes.
insert into temp_userlook_0305 (view_date, user_id, age, programid, playtime)
values
    (20220221, 'u1', 30, 'a', 4),
    (20220221, 'u1', 30, 'b', 10),
    (20220221, 'u1', 30, 'a', 2),
    (20220221, 'u2', 23, 'c', 1),
    (20220222, 'u3', 26, 'd', 3),
    (20220223, 'u2', 23, 'a', 2);

-- Sanity check of the loaded rows.
select
    view_date,
    user_id,
    age,
    programid,
    playtime
from temp_userlook_0305;

-- Requirement 1: total number of users, average user age, average watch time per user.
-- Collapse the viewing records to one row per (user_id, age) carrying that
-- user's summed playtime, then aggregate over those rows: number of rows with a
-- user_id, mean age, and mean summed playtime.
with user_playtime as (
    select
        user_id,
        age,
        sum(playtime) as playtime_sum
    from temp_userlook_0305
    group by
        user_id,
        age
)
select
    count(user_id)    as user_cnt,
    avg(age)          as age_avg,
    avg(playtime_sum) as playtime_sum_avg
from user_playtime;
需求二、每10歲一個分段,統計每個區間的使用者總量,使用者平均觀看時長
-- Per 10-year age bucket: number of users and their mean summed playtime.
-- The bucket's lower bound (20, 30, ...) is returned as age_group_start so each
-- result row can be tied to its age segment; the original query grouped by the
-- bucket but never output it.
-- The bucket is derived in its own step so the final query only selects plain
-- group-by columns.
with user_playtime as (
    select
        user_id,
        age,
        sum(playtime) as playtime_sum
    from temp_userlook_0305
    group by
        user_id,
        age
),
user_bucket as (
    select
        user_id,
        playtime_sum,
        floor(age / 10) * 10 as age_group_start
    from user_playtime
)
select
    age_group_start,
    count(user_id)    as user_cnt,
    avg(playtime_sum) as playtime_sum_avg
from user_bucket
group by
    age_group_start;
需求三、每個使用者最喜歡的節目
-- Favorite program per user: sum playtime per (user_id, programid, age), rank
-- each user's programs by that sum in descending order, and keep the rank-1 rows.
-- dense_rank gives tied sums the same rank, so a user whose top programs tie
-- gets one row per tied program.
with program_playtime as (
    select
        user_id,
        programid,
        age,
        sum(playtime) as playtime_sum
    from temp_userlook_0305
    group by
        user_id,
        programid,
        age
),
ranked_program as (
    select
        user_id,
        programid,
        age,
        dense_rank() over (
            partition by user_id
            order by playtime_sum desc
        ) as playtime_rank
    from program_playtime
)
select
    user_id,
    programid,
    age
from ranked_program
where playtime_rank = 1;
需求四、觀看時長大於5min的使用者總量,只要有一個節目使用者觀看時間小於5min就不能算
-- Number of users who have no viewing record shorter than 5.
-- The original LEFT JOIN had no ON condition, so it did not match rows by user:
-- every row was paired with every short record, and the IS NULL filter could
-- only pass when the table contained no short record at all.
-- Counting short records per user expresses the rule directly in one pass.
-- NOTE(review): as in the original, the check is per viewing record. If the
-- rule is meant per program total, sum playtime by (user_id, programid) first.
with user_short_views as (
    select
        user_id,
        sum(case when playtime < 5 then 1 else 0 end) as short_view_cnt
    from temp_userlook_0305
    group by
        user_id
)
select
    count(user_id) as user_cnt
from user_short_views
where short_view_cnt = 0;
-- Fixture for the viewing-duration exercises: one row per viewing record
-- (view_date, user_id, age, programid, playtime).
-- NOTE(review): view_date literals look like yyyyMMdd and playtime is presumably
-- in minutes (the requirements speak of "5min") -- confirm.
-- Dropping first keeps the setup re-runnable: a plain CREATE TABLE fails when
-- the table already exists, and re-inserting would duplicate the sample rows.
drop table if exists temp_userlook_0305;

create table temp_userlook_0305 (
    view_date bigint,
    user_id   string,
    age       int,
    programid string,
    playtime  int
)
stored as orc
tblproperties ('orc.compress' = 'snappy');

-- Explicit column list so the rows stay correct if the table layout changes.
insert into temp_userlook_0305 (view_date, user_id, age, programid, playtime)
values
    (20220221, 'u1', 30, 'a', 4),
    (20220221, 'u1', 30, 'b', 10),
    (20220221, 'u1', 30, 'a', 2),
    (20220221, 'u2', 23, 'c', 1),
    (20220222, 'u3', 26, 'd', 3),
    (20220223, 'u2', 23, 'a', 2);

-- Sanity check of the loaded rows.
select
    view_date,
    user_id,
    age,
    programid,
    playtime
from temp_userlook_0305;