HiveSQL、SparkSQL資料傾斜優化示例(join需要保留異常資料的場景)
阿新 • • 發佈:2021-01-30
技術標籤:大資料
-- Original logic (baseline, no skew mitigation).
-- Left-joins test_table_1 to test_table_2 on m2_id = id and returns 10 rows.
-- Per the author's note, roughly 90% of m1.m2_id values are 0, so almost all
-- left-side rows hash to the same join key and the join is heavily skewed.
select
    m1.id
    ,m1.addtime
    ,m2.id
    ,m2.updatetime
from
(
    select
        src.id
        ,src.addtime
        ,src.m2_id
    from test.test_table_1 src
) m1
left join
(
    -- test_table_2 contains rows with id = 0
    select
        dim.id
        ,dim.updatetime
    from test.test_table_2 dim
) m2
    on m1.m2_id = m2.id -- ~90% of m1.m2_id is 0, which causes the data skew
limit 10
;
-- Suitable when the skew is caused by a few enumerable key values (here: 0).
-- Technique: salt the join key.
--   * Left side: rows carrying the hot key get a random salt in 0..99; every
--     other row gets the fixed salt 101.
--   * Right side: rows with the hot key are replicated once per salt value
--     0..99; every other row carries the fixed salt 101.
-- Joining on (key, salt) spreads the hot key over 100 distinct join keys while
-- every left-side row still finds exactly the right-side rows it matched
-- before, so the left-join result is unchanged.
-- The number of salt values (N = 100) MUST be identical on both sides.
select
    m1.id
    ,m1.addtime
    ,m2.id
    ,m2.updatetime
from
(
    select
        id
        ,addtime
        ,m2_id
        -- rand() is in [0, 1), so cast(rand() * 100 as int) yields 0..99.
        -- Hot-key rows are scattered over those 100 salts; all other rows
        -- (including NULL m2_id) fall through to the fixed salt 101.
        ,case when m2_id in (0) then cast(rand() * 100 as int) else 101 end as rand_num
    from test.test_table_1 t
) m1
left join
(
    -- Normal rows: pinned to the fixed salt 101.
    select id, updatetime, 101 as rand_num
    from test.test_table_2
    where id <> 0 -- hot key is handled separately below
    union all
    -- Hot-key rows: replicated once per salt value 0..99.
    select
        hot.id
        ,hot.updatetime
        ,cast(ex.rand_num as int) as rand_num
    from
    (
        select id, updatetime
        from test.test_table_2
        where id = 0 -- only the hot key
    ) hot
    -- Exactly the 100 salt values (0..99) the left side can generate.
    lateral view explode(split('0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99',',')) ex as rand_num
) m2
    on m1.m2_id = m2.id -- ~90% of m1.m2_id is 0, which would otherwise skew the join
    and m1.rand_num = m2.rand_num -- the salt spreads the hot key across join partitions
limit 10
;
-- Suitable when too many values cause skew to enumerate them (tens to
-- hundreds), or when the hot values are not known in advance (today a and b
-- are skewed, tomorrow it may be c, d and e).
-- Technique: salt every row. Each left-side row gets a random salt in 0..49
-- and the whole right-side table is replicated once per salt value 0..49.
-- Trade-off: replicating one table N times produces more intermediate data and
-- more IO, so this is not guaranteed to be faster; it does distribute the data
-- more evenly and reduces the chance of skew-induced OOM.
-- The number of salt values (N = 50) MUST be identical on both sides.
select
    m1.id
    ,m1.addtime
    ,m2.id
    ,m2.updatetime
from
(
    select
        id
        ,addtime
        ,m2_id
        -- rand() is in [0, 1), so cast(rand() * 50 as int) yields 0..49.
        ,cast(rand() * 50 as int) as rand_num
    from test.test_table_1 t
) m1
left join
(
    -- Only columns exposed by the derived table below may be projected here.
    select
        src.id
        ,src.updatetime
        ,cast(ex.rand_num as int) as rand_num
    from
    (
        -- Whole table is replicated; it contains rows with id = 0.
        select id, updatetime
        from test.test_table_2
    ) src
    -- Exactly the 50 salt values (0..49) the left side can generate.
    lateral view explode(split('0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49',',')) ex as rand_num
) m2
    on m1.m2_id = m2.id -- ~90% of m1.m2_id is 0, which would otherwise skew the join
    and m1.rand_num = m2.rand_num -- the salt spreads each key across join partitions
limit 10
;