1. 程式人生 > 其它 > HiveSQL、SparkSQL資料傾斜優化示例(join需要保留異常資料的場景)

HiveSQL、SparkSQL資料傾斜優化示例(join需要保留異常資料的場景)

技術標籤:大資料

-- Baseline logic (no skew mitigation): plain left join of table 1 to table 2.
with fact_rows as (
    select
        id,
        addtime,
        m2_id
    from test.test_table_1
),
lookup_rows as (
    select
        id,
        updatetime
    from test.test_table_2  -- contains rows whose id is 0
)
select
    fact_rows.id,
    fact_rows.addtime,
    lookup_rows.id,
    lookup_rows.updatetime
from fact_rows
left join lookup_rows
    on fact_rows.m2_id = lookup_rows.id  -- ~90% of m2_id values are 0, which skews this join
limit 10;
-- Variant 1: salt only the known hot key(s).
-- Use when the skewed join-key values are few and can be enumerated (here: 0).
--   * Left side : rows carrying the hot key get a random salt in 0..99;
--                 every other row gets the fixed sentinel salt 101.
--   * Right side: normal rows are kept once with salt 101; hot-key rows are
--                 replicated once per salt value 0..99.
-- Each left row therefore still meets exactly the right rows it met before,
-- but the hot key is spread over 100 (key, salt) combinations.
select
     m1.id
    ,m1.addtime
    ,m2.id
    ,m2.updatetime
from
(
    select
         id
        ,addtime
        ,m2_id
        -- cast(rand() * 100 as int) yields 0..99; 101 lies outside that range
        -- so normal rows can never collide with a salted hot-key row.
        ,case when m2_id in (0) then cast(rand() * 100 as int) else 101 end as rand_num
    from test.test_table_1 t
) m1
left join
(
    -- Normal keys: one copy, pinned to the sentinel salt.
    select
         id
        ,updatetime
        ,101 as rand_num
    from test.test_table_2
    where id <> 0
    union all
    -- Hot key: pulled out and replicated once per salt value.
    select
         hot.id
        ,hot.updatetime
        ,ex.rand_num
    from
    (
        select id, updatetime
        from test.test_table_2
        where id = 0
    ) hot
    -- space(99) split on ' ' gives 100 elements, so posexplode emits
    -- positions 0..99 -- the same range the left-side salt can take.
    -- Keep this count equal to the multiplier used in rand() * 100 above.
    lateral view posexplode(split(space(99), ' ')) ex as rand_num, pad
) m2
       on m1.m2_id = m2.id           -- ~90% of m1.m2_id values are 0 (the skewed key)
      and m1.rand_num = m2.rand_num  -- the salt spreads the hot key across partitions
limit 10
;
-- Variant 2: salt every row.
-- Use when the skewed values are too many to enumerate (tens to hundreds) or
-- change from run to run (a and b today, c, d and e tomorrow).
-- Trade-off: the right-hand table is replicated N times, producing more
-- intermediate data and I/O. This is not guaranteed to be faster, but it
-- distributes rows more evenly and lowers the risk of skew-induced OOM.
select
     m1.id
    ,m1.addtime
    ,m2.id
    ,m2.updatetime
from
(
    select
         id
        ,addtime
        ,m2_id
        -- Every row (not only the hot keys) gets a random salt in 0..49.
        ,cast(rand() * 50 as int) as rand_num
    from test.test_table_1 t
) m1
left join
(
    -- Replicate the whole table once per salt value so that any salted
    -- left-side row finds its counterpart.
    select
         src.id
        ,src.updatetime
        ,ex.rand_num
    from
    (
        select id, updatetime
        from test.test_table_2  -- contains rows whose id is 0
    ) src
    -- space(49) split on ' ' gives 50 elements, so posexplode emits
    -- positions 0..49 -- the same range the left-side salt can take.
    -- Keep this count equal to the multiplier used in rand() * 50 above.
    lateral view posexplode(split(space(49), ' ')) ex as rand_num, pad
) m2
       on m1.m2_id = m2.id           -- ~90% of m1.m2_id values are 0 (the skewed key)
      and m1.rand_num = m2.rand_num  -- the salt spreads each key across partitions
limit 10
;