2018-09-27#hive資料檢查的常見方式
阿新 • • 發佈:2018-12-12
hive資料檢查的常見方式
主鍵上的資料是否有重複
-- Primary-key uniqueness check: list each phone_segment value that appears on
-- more than one row of dw.dim_phone_segment_info, together with its row count.
-- An empty result means no phone_segment value is duplicated.
select
    phone_segment,
    count(*) as cnt
from dw.dim_phone_segment_info
group by
    phone_segment
having
    count(*) > 1;
對比去重前後的資料量
-- Compare the row count before and after de-duplicating on phone_num.
-- First column: total number of rows in dw.dim_phone_profile.
-- Second column: number of distinct non-NULL phone_num values.
-- NOTE: count(distinct ...) skips NULLs, so rows with a NULL phone_num also
-- make the two numbers differ, not only duplicated values.
select
    count(*),
    count(distinct phone_num)
from dw.dim_phone_profile;
取每個分組中的唯一一條（依時間排序取最新的一條）
-- Rebuild temp.zhjq_tmp_cc_phone with exactly one row per user_num, taken from
-- temp.icsoc_call_detail_bill_bill201807: the row with the greatest time_start.
-- "if exists" keeps the script re-runnable even when the table is absent and
-- hive.exec.drop.ignorenonexistent is disabled.
drop table if exists temp.zhjq_tmp_cc_phone;

create table temp.zhjq_tmp_cc_phone as
select
    latest.user_num,
    latest.user_province,
    latest.user_city
from (
    select
        a.user_num,
        -- NULL, empty or whitespace-only province/city falls through to the
        -- 'unknow' placeholder (length(trim(NULL)) > 0 is not true).
        case
            when length(trim(a.user_area_province)) > 0 then a.user_area_province
            else 'unknow'
        end as user_province,
        case
            when length(trim(a.user_area_city)) > 0 then a.user_area_city
            else 'unknow'
        end as user_city,
        -- rn = 1 marks the newest row per user_num. If several rows share the
        -- same newest time_start, which of them gets rn = 1 is not deterministic.
        row_number() over (
            partition by a.user_num
            order by a.time_start desc
        ) as rn
    from temp.icsoc_call_detail_bill_bill201807 a
    -- Skip rows whose user_num is NULL, empty or whitespace-only.
    where length(trim(a.user_num)) > 0
) latest
where latest.rn = 1;