充分吸收這十題,hivesql面試沒問題(上)
第一題
需求:
我們有如下的使用者訪問資料:
userId visitDate visitCount
u01 2017/1/21 5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
U02 2017/1/23 6
U01 2017/2/22 4
要求使用SQL統計出每個使用者的累積訪問次數,如下表所示: 使用者id 月份 小計 累積實現
資料準備*/
CREATE TABLE test_sql.test1 (
userId string,
visitDate string,
visitCount INT )
ROW format delimited FIELDS TERMINATED BY "\t";
INSERT INTO TABLE test_sql.test1
VALUES
( 'u01', '2017/1/21', 5 ),
( 'u02', '2017/1/23', 6 ),
( 'u03', '2017/1/22', 8 ),
( 'u04', '2017/1/20', 3 ),
( 'u01', '2017/1/23', 6 ),
( 'u01', '2017/2/21', 8 ),
( 'u02', '2017/1/23', 6 ),
( 'u01', '2017/2/22', 4 );
查詢SQL:
SELECT t2.userid,
t2.visitmonth,
subtotal_visit_cnt,
sum(subtotal_visit_cnt) over (partition BY userid
ORDER BY visitmonth) AS total_visit_cnt
FROM
(SELECT userid,
visitmonth,
sum(visitcount) AS subtotal_visit_cnt
FROM
(SELECT userid,
date_format(regexp_replace(visitdate,'/','-'),'yyyy-MM') AS visitmonth,
visitcount
FROM test_sql.test1) t1
GROUP BY userid,
visitmonth)t2
ORDER BY t2.userid,
t2.visitmonth;
第二題
需求:
有50W個京東店鋪,每個顧客訪客訪問任何一個店鋪的任何一個商品時都會產生一條訪問日誌,
訪問日誌儲存的表名為Visit,訪客的使用者id為user_id,被訪問的店鋪名稱為shop,資料如下:
請統計:
(1)每個店鋪的UV(訪客數)
(2)每個店鋪訪問次數top3的訪客資訊。輸出店鋪名稱、訪客id、訪問次數
資料準備:
CREATE TABLE test_sql.test2 (
user_id string,
shop string )
ROW format delimited FIELDS TERMINATED BY '\t';
INSERT INTO TABLE test_sql.test2 VALUES
( 'u1', 'a' ),
( 'u2', 'b' ),
( 'u1', 'b' ),
( 'u1', 'a' ),
( 'u3', 'c' ),
( 'u4', 'b' ),
( 'u1', 'a' ),
( 'u2', 'c' ),
( 'u5', 'b' ),
( 'u4', 'b' ),
( 'u6', 'c' ),
( 'u2', 'c' ),
( 'u1', 'b' ),
( 'u2', 'a' ),
( 'u2', 'a' ),
( 'u3', 'a' ),
( 'u5', 'a' ),
( 'u5', 'a' ),
( 'u5', 'a' );
查詢SQL實現
方式1:
每個店鋪的UV(訪客數)
SELECT shop,count(DISTINCT user_id)
FROM test_sql.test2
GROUP BY shop;
方式2:
每個店鋪的UV(訪客數)
SELECT t.shop,count()
FROM
(SELECT user_id,shop
FROM test_sql.test2
GROUP BY user_id,shop) t
GROUP BY t.shop;
//每個店鋪訪問次數top3的訪客資訊。輸出店鋪名稱、訪客id、訪問次數
SELECT t2.shop,t2.user_id,t2.cnt
FROM
(SELECT t1.,
row_number() over(partition BY t1.shop ORDER BY t1.cnt DESC) rank
FROM
(SELECT user_id,shop,count(*) AS cnt
FROM test_sql.test2
GROUP BY user_id,shop) t1)t2
WHERE rank <= 3;
第三題
需求
已知一個表STG.ORDER,有如下欄位:Date,Order_id,User_id,amount。
資料樣例:2017-01-01,10029028,1000003251,33.57。
請給出sql進行統計:
(1)給出 2017年每個月的訂單數、使用者數、總成交金額。
(2)給出2017年11月的新客數(指在11月才有第一筆訂單)
實現
資料準備*/
CREATE TABLE test_sql.test3 (
dt string,
order_id string,
user_id string,
amount DECIMAL ( 10, 2 ) )
ROW format delimited FIELDS TERMINATED BY '\t';
INSERT INTO TABLE test_sql.test3 VALUES ('2017-01-01','10029028','1000003251',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-01-01','10029029','1000003251',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-01-01','100290288','1000003252',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-02-02','10029088','1000003251',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-02-02','100290281','1000003251',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-02-02','100290282','1000003253',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-11-02','10290282','100003253',234);
INSERT INTO TABLE test_sql.test3 VALUES ('2018-11-02','10290284','100003243',234);
查詢SQL
(1)給出 2017年每個月的訂單數、使用者數、總成交金額。
SELECT t1.mon,
count(t1.order_id) AS order_cnt,
count(DISTINCT t1.user_id) AS user_cnt,
sum(amount) AS total_amount
FROM
(SELECT order_id,
user_id,
amount,
date_format(dt,'yyyy-MM') mon
FROM test_sql.test3
WHERE date_format(dt,'yyyy') = '2017') t1
GROUP BY t1.mon;
(2)給出2017年11月的新客數(指在11月才有第一筆訂單)
第三題第二問
select count(user_id)
from (
SELECT user_id
FROM test_sql.test3
GROUP BY user_id
HAVING date_format(min(dt), 'yyyy-MM') = '2017-01'
);
第三題 引申 第三問,每個月的新使用者
select min_mon, count(user_id)
from (
SELECT user_id,
date_format(min(dt), 'yyyy-MM') min_mon
FROM test_sql.test3
GROUP BY user_id
) a
group by min_mon
order by min_mon;
第四題
需求
有一個5000萬的使用者檔案(user_id,name,age),
一個2億記錄的使用者看電影的記錄檔案(user_id,url),根據年齡段(每10歲)觀看電影的次數進行排序?
資料準備:
CREATE TABLE test_sql.test4user
(
user_id string,
name string,
age int
);
CREATE TABLE test_sql.test4log
(
user_id string,
url string
);
INSERT INTO TABLE test_sql.test4user
VALUES ('001', 'u1', 10);
INSERT INTO TABLE test_sql.test4user
VALUES ('002', 'u2', 15);
INSERT INTO TABLE test_sql.test4user
VALUES ('003', 'u3', 15);
INSERT INTO TABLE test_sql.test4user
VALUES ('004', 'u4', 20);
INSERT INTO TABLE test_sql.test4user
VALUES ('005', 'u5', 25);
INSERT INTO TABLE test_sql.test4user
VALUES ('006', 'u6', 35);
INSERT INTO TABLE test_sql.test4user
VALUES ('007', 'u7', 40);
INSERT INTO TABLE test_sql.test4user
VALUES ('008', 'u8', 45);
INSERT INTO TABLE test_sql.test4user
VALUES ('009', 'u9', 50);
INSERT INTO TABLE test_sql.test4user
VALUES ('0010', 'u10', 65);
INSERT INTO TABLE test_sql.test4log
VALUES ('001', 'url1');
INSERT INTO TABLE test_sql.test4log
VALUES ('002', 'url1');
INSERT INTO TABLE test_sql.test4log
VALUES ('003', 'url2');
INSERT INTO TABLE test_sql.test4log
VALUES ('004', 'url3');
INSERT INTO TABLE test_sql.test4log
VALUES ('005', 'url3');
INSERT INTO TABLE test_sql.test4log
VALUES ('006', 'url1');
INSERT INTO TABLE test_sql.test4log
VALUES ('007', 'url5');
INSERT INTO TABLE test_sql.test4log
VALUES ('008', 'url7');
INSERT INTO TABLE test_sql.test4log
VALUES ('009', 'url5');
INSERT INTO TABLE test_sql.test4log
VALUES ('0010', 'url1');
查詢SQL 思路一
SELECT t2.age_phase,
sum(t1.cnt) as view_cnt
FROM (SELECT user_id,
count(*) cnt
FROM test_sql.test4log
GROUP BY user_id) t1
JOIN(SELECT user_id,
CASE
WHEN age <= 10 AND age > 0 THEN '0-10'
WHEN age <= 20 AND age > 10 THEN '10-20'
WHEN age > 20 AND age <= 30 THEN '20-30'
WHEN age > 30 AND age <= 40 THEN '30-40'
WHEN age > 40 AND age <= 50 THEN '40-50'
WHEN age > 50 AND age <= 60 THEN '50-60'
WHEN age > 60 AND age <= 70 THEN '60-70'
ELSE '70以上' END as age_phase
FROM test_sql.test4user) t2 ON t1.user_id = t2.user_id
GROUP BY t2.age_phase;
思路二:
SELECT t2.age_phase,
sum(t1.cnt) as view_cnt
FROM (SELECT user_id,
count(*) cnt
FROM test_sql.test4log
GROUP BY user_id) t1
JOIN(SELECT user_id,
age,
concat(floor(age / 10) * 10, '-', (floor(age / 10) + 1) * 10) as age_phase
FROM test_sql.test4user) t2
ON t1.user_id = t2.user_id
GROUP BY t2.age_phase;
第五題
需求:
有日誌如下,請寫出程式碼求得所有使用者和活躍使用者的總數及平均年齡。(活躍使用者指連續兩天都有
訪問記錄的使用者)
--日期 使用者 年齡
2019-02-11,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19
資料準備:
CREATE TABLE test5(
dt string,
user_id string,
age int)
ROW format delimited fields terminated BY ',';
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_1',23);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_2',19);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_3',39);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_1',23);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_3',39);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_1',23);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-12','test_2',19);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-13','test_1',23);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-15','test_2',19);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-16','test_2',19);
查詢SQL:
SELECT sum(total_user_cnt) total_user_cnt,
sum(total_user_avg_age) total_user_avg_age,
sum(two_days_cnt) two_days_cnt,
sum(avg_age) avg_age
FROM (SELECT 0 total_user_cnt,
0 total_user_avg_age,
count() AS two_days_cnt,
cast(sum(age) / count() AS decimal(5, 2)) AS avg_age
FROM (SELECT user_id, max(age) age
FROM (SELECT user_id, max(age) age
FROM (SELECT user_id, age, dt, rank, date_sub(dt, rank) flag
FROM (SELECT dt,
user_id,
max(age) age,
row_number() over (PARTITION BY user_id
ORDER BY dt) rank
FROM test_sql.test5
GROUP BY dt, user_id) t1) t2
GROUP BY user_id, flag
HAVING count() >= 2) t3
GROUP BY user_id) t4
UNION ALL
SELECT count() total_user_cnt,
cast(sum(age) / count(*) AS decimal(5, 2)) total_user_avg_age,
0 two_days_cnt,
0 avg_age
FROM (SELECT user_id,
max(age) age
FROM test_sql.test5
GROUP BY user_id) t5) t6;
思路二:
SELECT b.total_user_cnt total_user_cnt,
b.total_user_avg_age total_user_avg_age,
a.two_days_cnt two_days_cnt,
a.avg_age avg_age
FROM (SELECT count() AS two_days_cnt,
cast(sum(age) / count() AS decimal(5, 2)) AS avg_age
FROM (SELECT user_id, max(age) age
FROM (SELECT user_id, max(age) age
FROM (SELECT user_id, age, dt, rank, date_sub(dt, rank) flag
FROM (SELECT dt,
user_id,
max(age) age,
row_number() over (PARTITION BY user_id
ORDER BY dt) rank
FROM test_sql.test5
GROUP BY dt, user_id) t1) t2
GROUP BY user_id, flag
HAVING count() >= 2) t3
GROUP BY user_id) t4) a
CROSS JOIN
(SELECT count() total_user_cnt,
cast(sum(age) / count(*) AS decimal(5, 2)) total_user_avg_age
FROM (SELECT user_id,
max(age) age
FROM test_sql.test5
GROUP BY user_id) t5) b on 1 = 1