關於hive 分桶重排序的一個栗子
阿新 • • 發佈:2022-03-09
需求
原始資料
year | tag |
2014 | 1 |
2015 | 1 |
2016 | 0 |
2017 | 0 |
2018 | 0 |
2020 | 1 |
2021 | 1 |
2022 | 1 |
結果資料
2014 | 1 | 1 |
2015 | 1 | 2 |
2016 | 0 | 1 |
2017 | 0 | 2 |
2018 | 0 | 3 |
2020 | 1 | 1 |
2021 | 1 | 2 |
2022 | 1 | 3 |
資料準備
create table temp_block_sort_0309 ( year int, tag int ) stored as orc tblproperties ('orc.compress' = 'snappy'); insert into temp_block_sort_0309 values (2014, 1), (2015, 1), (2016, 0), (2017, 0), (2018, 0), (2020, 1), (2021, 1), (2022, 1); select * fromtemp_block_sort_0309;
解題方案
解決這個問題可以分 3 步走 1、判定當前行和上一行的第二列是否相等,如果不等置為1,相等為0,作為flag列
select *, case when lag(tag, 1) over (order by year) != tag then 1 else 0 end as flag from temp_block_sort_0309;
year | tag | flag |
2014 | 1 | 0 |
2015 | 1 | 0 |
2016 | 0 | 1 |
2017 | 0 | 0 |
2018 | 0 | 0 |
2020 | 1 | 1 |
2021 | 1 | 0 |
2022 | 1 | 0 |
select *, sum(flag) over (order by year) tong_id from (select *, case when lag(tag, 1) over (order by year) != tag then 1 else 0 end as flag from temp_block_sort_0309) t;
year | tag | flag | tong_id |
2014 | 1 | 0 | 0 |
2015 | 1 | 0 | 0 |
2016 | 0 | 1 | 1 |
2017 | 0 | 0 | 1 |
2018 | 0 | 0 | 1 |
2020 | 1 | 1 | 2 |
2021 | 1 | 0 | 2 |
2022 | 1 | 0 | 2 |
select year, tag, row_number() over (partition by tong_id order by year) js_id, flag from (select *, sum(flag) over (order by year) tong_id from (select *, case when lag(tag, 1) over (order by year) != tag then 1 else 0 end as flag from temp_block_sort_0309) t) t order by 1;
year | tag | js_id | flag_id |
2014 | 1 | 1 | 0 |
2015 | 1 | 2 | 0 |
2016 | 0 | 1 | 1 |
2017 | 0 | 2 | 0 |
2018 | 0 | 3 | 0 |
2020 | 1 | 1 | 1 |
2021 | 1 | 2 | 0 |
2022 | 1 | 3 | 0 |