檢查數據傾斜分布
阿新 • • 發佈:2017-07-28
inf pgsql stand lec data rownum char _id 一個
從傳統數據庫遷移到GP(Greenplum)時,一個重要且常被開發者忽略的概念是數據分布:表的分布鍵若設計不當,會導致嚴重的性能問題。下面的函數可供開發者及DBA檢測一張表的數據傾斜情況。
-EOF-
從傳統數據庫遷移到GP(Greenplum)時,一個重要且常被開發者忽略的概念是數據分布:表的分布鍵若設計不當,會導致嚴重的性能問題。下面的函數可供開發者及DBA檢測一張表的數據傾斜情況。
-- Function: gpmg.data_skew(character varying)
-- DROP FUNCTION gpmg.data_skew(character varying);

CREATE OR REPLACE FUNCTION gpmg.data_skew(tablename character varying)
  RETURNS text AS
$BODY$
-- 2014-05-26, Gtlions: collect and summarise the data skew of one table.
--
-- Parameter:
--   tablename  name of the table to inspect, optionally schema-qualified.
-- Returns:
--   a single text line holding the number of primary segments, the ideal
--   per-segment share (1 / segments), the table's total row count, and the
--   host name, row count and row share of the most and least loaded segment.
--   For an empty table a short "table is empty" summary is returned instead.
-- Side effects:
--   draws one value from gpmg.commonseq and uses gpmg.commontab as scratch
--   space; the scratch rows are deleted again before returning.
-- Raises:
--   an error when tablename does not resolve to an existing relation.
--
-- Scratch columns in gpmg.commontab:
--   bg1 = segment id (NULL on the rollup grand-total row)
--   bg2 = row count on that segment (grand total on the rollup row)
--   nm1 = ideal share, nm2 = actual share, nm3 = abs(ideal - actual)
declare
  v_tabname      text;
  v_id           integer;
  v_segs         integer := 64;
  v_datasegs     integer := 0;
  v_totalnums    bigint  := 0;
  v_maxskew      numeric := 0.0;
  v_minskew      numeric := 0.0;
  v_maxskew_seg  varchar(20);
  v_minskew_seg  varchar(20);
  v_maxrows      bigint  := 0;
  v_minrows      bigint  := 0;
  v_result       varchar(2000);
begin
  v_tabname := tablename;

  -- The name is spliced into dynamic SQL below. Resolving it as a relation
  -- first rejects anything that is not a plain (optionally qualified or
  -- quoted) table name, so arbitrary SQL cannot be smuggled in through it.
  perform v_tabname::regclass;

  v_id := nextval('gpmg.commonseq');

  -- Primary segment instances only; content = -1 is the master.
  select into v_segs count(*)
    from gp_segment_configuration
   where role = 'p'
     and content <> -1;

  v_result := 'GP hava ' || v_segs
           || ' instances, Standard skew is ' || 1.0/v_segs || '. ';

  -- One scratch row per segment that holds rows, plus the rollup grand total.
  -- quote_literal() keeps a quote character in the name from breaking the
  -- generated statement.
  execute 'insert into gpmg.commontab(seq,tabname,bg1,bg2) '
       || 'select ' || v_id || ',' || quote_literal(v_tabname)
       || ',gp_segment_id,count(*) '
       || 'from ' || v_tabname
       || ' group by rollup((gp_segment_id))';

  -- The rollup grand total is the largest bg2 value of this run.
  select into v_totalnums coalesce(max(bg2), 0)
    from gpmg.commontab
   where seq = v_id
     and tabname = v_tabname;

  -- An empty table has no distribution to measure; returning early avoids the
  -- division by zero the share computation below would otherwise hit.
  if v_totalnums = 0 then
    delete from gpmg.commontab where seq = v_id and tabname = v_tabname;
    return v_result || 'You Table [' || v_tabname
        || '] skew info: [table_totalrows:0, table is empty]';
  end if;

  update gpmg.commontab
     set nm1 = 1::numeric / v_segs,
         nm2 = bg2::numeric / v_totalnums,
         nm3 = abs(1::numeric / v_segs - bg2::numeric / v_totalnums)
   where seq = v_id
     and tabname = v_tabname;

  -- Extremes over the per-segment rows only (the rollup row has bg1 NULL).
  select into v_maxskew, v_minskew, v_datasegs
         max(nm2), min(nm2), count(*)
    from gpmg.commontab
   where seq = v_id
     and tabname = v_tabname
     and bg1 is not null;

  select into v_maxskew_seg hostname
    from gp_segment_configuration
   where role = 'p'
     and content in (select bg1
                       from gpmg.commontab
                      where seq = v_id
                        and tabname = v_tabname
                        and bg1 is not null
                        and nm2 = v_maxskew
                      limit 1);

  select into v_maxrows bg2
    from gpmg.commontab
   where seq = v_id
     and tabname = v_tabname
     and bg1 is not null
     and nm2 = v_maxskew
   limit 1;

  if v_datasegs < v_segs then
    -- Segments without any rows never show up in the GROUP BY result, so the
    -- true minimum is an empty segment; report one of those instead of the
    -- smallest non-empty one.
    v_minrows := 0;
    v_minskew := 0;
    select into v_minskew_seg hostname
      from gp_segment_configuration
     where role = 'p'
       and content <> -1
       and content not in (select bg1
                             from gpmg.commontab
                            where seq = v_id
                              and tabname = v_tabname
                              and bg1 is not null)
     order by content
     limit 1;
  else
    select into v_minskew_seg hostname
      from gp_segment_configuration
     where role = 'p'
       and content in (select bg1
                         from gpmg.commontab
                        where seq = v_id
                          and tabname = v_tabname
                          and bg1 is not null
                          and nm2 = v_minskew
                        limit 1);

    select into v_minrows bg2
      from gpmg.commontab
     where seq = v_id
       and tabname = v_tabname
       and bg1 is not null
       and nm2 = v_minskew
     limit 1;
  end if;

  v_result := v_result || 'You Table [' || v_tabname
           || '] skew info: [table_totalrows:' || v_totalnums
           || ', maxskew:seg-' || v_maxskew_seg
           || ', rows-' || v_maxrows || ' ' || v_maxskew
           || ', minskew:seg-' || v_minskew_seg
           || ', rows-' || v_minrows || ' ' || v_minskew || ']';

  -- Drop this run's scratch rows.
  delete from gpmg.commontab where seq = v_id and tabname = v_tabname;

  return v_result;
end;
$BODY$
  LANGUAGE plpgsql VOLATILE;

ALTER FUNCTION gpmg.data_skew(character varying) OWNER TO gpadmin;
GRANT EXECUTE ON FUNCTION gpmg.data_skew(character varying) TO public;
GRANT EXECUTE ON FUNCTION gpmg.data_skew(character varying) TO gpadmin;

-- Example session:
--
-- bigdatagp=# select gpmg.data_skew('gpmg.manager_table');
--  data_skew
-- -----------
--  GP hava 64 instances, Standard skew is 0.01562500000000000000. You Table [gpmg.manager_table] skew info: [table_totalrows:83, maxskew:seg-sdw16, rows-3 0.03614457831325301205, minskew:seg-sdw2, rows-1 0.01204819277108433735]
-- (1 row)
--
-- bigdatagp=# select gpmg.data_skew('gpmg.func_log');
--  data_skew
-- -----------
--  GP hava 64 instances, Standard skew is 0.01562500000000000000. You Table [gpmg.func_log] skew info: [table_totalrows:53708, maxskew:seg-sdw10, rows-907 0.01688761450808073285, minskew:seg-sdw7, rows-773 0.01439264169211290683]
-- (1 row)
--
-- 2014-10-14 09:53:00
-EOF-
檢查數據傾斜分布