PostgreSQL Limit對索引的影響
阿新 • • 發佈:2018-11-10
伺服器CPU排行榜
相關行業的同學如看不懂,應該好好反思一下自己,思考人生了.
1.建立測試表
-- Benchmark table: a serial key, one scalar column (num) and one
-- integer-array column (ref). Secondary indexes are created later.
drop table if exists test;

create table test (
    objectid serial    not null,
    num      integer   not null,
    ref      integer[] not null,
    constraint pk_test_objectid primary key (objectid)
) with (fillfactor = 100);

-- Marks the primary-key index as the clustering index for this table.
alter table test cluster on pk_test_objectid;
為加快插入速度,其它索引在生成資料完成後再建立.
2.建立函式
函式用於控制num和ref的值分佈,以便num和ref欄位上的索引具有較高的可選擇性.
-- Remove the helper functions first (they take or return tweights[]),
-- then the type itself, so the script can be re-run from scratch.
drop function if exists saveAsTest(integer, integer[]);
drop function if exists gen_row(integer[], tweights[], tweights[]);
drop function if exists gen_array(integer[], tweights[]);
drop function if exists get_next_index(tweights[]);
drop type if exists tweights;

/****************************************************************************************
    Coefficient type for smooth weighted round-robin.
    weight:    the configured coefficient
    curweight: the coefficient currently in effect; initialise it to 0
****************************************************************************************/
create type tweights as (
    weight    integer,
    curweight integer
);
/****************************************************************************************
    Smooth weighted round-robin balancing: pick the next index.

    Argument:
        $1  array of tweights coefficients (weight, curweight).
    Returns one row:
        index    1-based position of the selected coefficient
        weights  the coefficient array with updated curweight values; pass it
                 back in on the next call to continue the rotation

    Example: array[((50,0)::tweights),((30,0)::tweights),((15,0)::tweights),((5,0)::tweights)]
    configures 4 coefficients whose weights sum to 100, so over every 100 calls
        index 1 is returned 50% of the time
        index 2 is returned 30% of the time
        index 3 is returned 15% of the time
        index 4 is returned  5% of the time
****************************************************************************************/
create or replace function get_next_index(tweights[])
    returns table(index integer, weights tweights[])
as $$
declare
    v_len      integer;
    v_index    integer;
    v_total    integer;
    v_tmp      tweights;
    v_tmpindex tweights;
begin
    v_len := array_length($1, 1);

    -- Single coefficient: nothing to balance, index 1 is the only choice.
    if (1 = v_len) then
        return query select 1, $1;
        -- RETURN QUERY only appends to the result set; an explicit RETURN is
        -- required to stop here, otherwise a second row would be emitted below.
        return;
    end if;

    v_index := -1;
    v_total := 0;

    -- Raise every curweight by its weight, accumulate the total weight, and
    -- remember the entry with the largest curweight (the first one wins ties).
    for v_i in 1..v_len loop
        v_tmp := $1[v_i];
        v_tmp.curweight := (v_tmp.curweight + v_tmp.weight);
        v_total := (v_total + v_tmp.weight);
        $1[v_i] = v_tmp;
        if (-1 = v_index or ($1[v_index]).curweight < v_tmp.curweight) then
            v_index := v_i;
        end if;
    end loop;

    -- Penalise the selected entry by the total weight so the others catch up.
    v_tmpindex := $1[v_index];
    v_tmpindex.curweight := v_tmpindex.curweight - v_total;
    $1[v_index] = v_tmpindex;

    return query select v_index, $1;
end;
$$ language plpgsql strict;
/****************************************************************************************
    Generate an array of 1 to 4 elements.

    Arguments:
        $1  candidate values; an element is picked by the index that
            get_next_index returns
        $2  smooth weighted round-robin coefficients used to pick the elements
    Returns one row:
        vals     the generated array
        weights  the coefficient array after the last pick, for the next call

    NOTE: the size is computed as (random()*(4-1)+1)::integer. Because the cast
    rounds to the nearest integer, sizes 1 and 4 come up about half as often
    as sizes 2 and 3.

    drop function if exists gen_array(integer[],tweights[]);
****************************************************************************************/
create or replace function gen_array(integer[], tweights[])
    returns table(vals integer[], weights tweights[])
as $$
    with recursive cte(id, val, weights, count) as (
        (
            -- First element; the target array size is drawn once, here.
            select
                1,
                $1[nxt.index],
                nxt.weights,
                ((random() * (4 - 1) + 1)::integer)
            from get_next_index($2) as nxt
        )
        union all
        -- Each further element continues from the weights left by the previous pick.
        select
            (p.id + 1),
            $1[a.index],
            a.weights,
            p.count
        from cte as p
        cross join lateral get_next_index(p.weights) as a
        where p.id < p.count
    )
    select
        -- Aggregate in generation order; without ORDER BY the element order
        -- of array_agg is not guaranteed.
        array_agg(cte.val order by cte.id),
        (select fin.weights from cte as fin where fin.id = fin.count)
    from cte;
$$ language sql strict;
/****************************************************************************************
    Generate one row of test data.

    The arrays $1, $2 and $3 must all have the same size.
        $1  candidate values
        $2  smooth weighted round-robin coefficients for the integer column
        $3  smooth weighted round-robin coefficients for the integer[] column
    Returns one row:
        num       the picked integer
        weights1  updated coefficients for $2
        ref       the generated integer array
        weights2  updated coefficients for $3

    drop function if exists gen_row(integer[],tweights[],tweights[]);
****************************************************************************************/
create or replace function gen_row(integer[], tweights[], tweights[])
    returns table(num integer, weights1 tweights[], ref integer[], weights2 tweights[])
as $$
    select
        $1[picked.index],
        picked.weights,
        arr.vals,
        arr.weights
    from get_next_index($2) as picked
    cross join gen_array($1, $3) as arr;
$$ language sql strict;
/****************************************************************************************
函式測試是否符合預期
****************************************************************************************/
/*
select *
from gen_row(
array[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
array[
(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights
],
array[
(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights
]);
*/
/****************************************************************************************
    Insert one row into the test table.

        $1  value for test.num
        $2  value for test.ref
    Returns the objectid generated for the new row.

    drop function if exists saveAsTest(integer,integer[]);
****************************************************************************************/
create or replace function saveAsTest(integer, integer[])
    returns integer
as $$
    insert into test (num, ref)
    values ($1, $2)
    returning objectid;
$$ language sql strict;
3.生成測試資料
- num的值範圍為1-20,平均分佈(各個的值佔比為5%).
- ref的值範圍為1-20,陣列大小控制在1-4(隨機大小),每生成100個數值各個值的佔比也為5%.
-- Empty the table and restart the objectid sequence at 1 in a single step.
-- TRUNCATE replaces the unfiltered DELETE (which leaves every old row behind
-- as a dead tuple), and RESTART IDENTITY replaces the manual setval() call on
-- the sequence owned by test.objectid.
truncate table test restart identity;
/****************************************************************************************
    Load the test data: open several terminals and run the script below in each.
    NOTE(review): the original text first says 10 terminals, but the author then
    states that 16 were opened (the test machine is a dual-socket, 16-core box),
    and 16 timings are reported.
    CPU model: Intel(R) Xeon(R) CPU E5530 @ 2.40GHz, a very dated CPU that sits
    near the bottom of the rankings.
    The table is simple, so the load writes little to disk (about 16MB/s at peak,
    below 2MB/s most of the time).
    The work is CPU-bound: 16 terminals running at once drove the CPU to 100%,
    and the fans were roaring after a while.
****************************************************************************************/
\timing on
do $$
declare
    v_nums     integer[];
    v_weights1 tweights[];
    v_weights2 tweights[];
    v_num      integer;
    v_ref      integer[];
begin
    -- Candidate values 1..20.
    v_nums := array[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20];
    -- Twenty equal coefficients (5 each, summing to 100) for num and for ref.
    v_weights1 := array_fill((5,0)::tweights, array[20]);
    v_weights2 := array_fill((5,0)::tweights, array[20]);

    for i in 1..1000000 loop
        -- Feed the updated weights back in so the rotation carries across rows.
        select r.num, r.weights1, r.ref, r.weights2
          into v_num, v_weights1, v_ref, v_weights2
          from gen_row(v_nums, v_weights1, v_weights2) as r;

        perform saveAsTest(v_num, v_ref);

        -- Progress: one notice per 1000 inserted rows (1, 2, 3, ...).
        if (i % 1000 = 0) then
            raise notice '%', i / 1000;
        end if;
    end loop;
end;
$$;
序號 | 耗時(ms) |
---|---|
1 | 1491206.016 |
2 | 1511390.919 |
3 | 1517245.568 |
4 | 1509241.432 |
5 | 1519552.252 |
6 | 1514420.896 |
7 | 1520820.174 |
8 | 1512984.280 |
9 | 1519851.215 |
10 | 1514590.502 |
11 | 1505463.332 |
12 | 1503091.390 |
13 | 1503749.024 |
14 | 1501670.722 |
15 | 1500027.669 |
16 | 1503459.150 |
4.建立索引
插入完成後vacuum表,測試時結果更準確.
-- Vacuum and analyze after the bulk load so the measurements are more accurate.
vacuum freeze verbose analyze test;
select count(*) from test;
/*
count
----------
16000000
(1 row)
Time: 587.956 ms
*/

/* B-tree index */
create index idx_test_num on test(num);

/* Array index
   Uses gin__int_ops; according to the author's tests so far it gave the best
   results for this workload.
   gin__int_ops is provided by the intarray extension, so the extension must
   be installed before the index can be created.
*/
create extension if not exists intarray;
create index idx_test_ref on test using gin(ref gin__int_ops);

/* Other array index types; each needs its own extension */
--create index idx_test_ref on test using gist(ref gist__int_ops);
--create index idx_test_ref on test using rum(ref rum_anyarray_ops);

/* Inspect the resulting table structure */
\dS+ test;
5.查詢測試
注意不要加order by,order by會影響執行計劃,目前只單純的測試limit和索引之間的關係.
執行查詢時多執行幾次,直至不讀取磁碟(沒有Buffers: shared read).
因為資料在表中的佔比一樣,因此只要查詢一個值就可以了.
/* Baseline queries without LIMIT.
   The array-containment operator was garbled to "[email protected]>" in the
   published text; it is restored to "ref @> ..." here. */

/* value present in the table, B-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num = 1;
--Execution time: 2568.059 ms

/* value not present in the table, B-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num = 21;
--Execution time: 0.044 ms

/* values present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[1];
--Execution time: 6589.734 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[1,2];
--Execution time: 9037.726 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[1,2,3];
--Execution time: 11621.418 ms

/* values not present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[21];
--Execution time: 0.065 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[21,22];
--Execution time: 0.056 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[21,22,23];
--Execution time: 0.060 ms
6.常規limit測試
/* The same queries with LIMIT 50.
   The array-containment operator was garbled to "[email protected]>" in the
   published text; it is restored to "ref @> ..." here. */

/* value present in the table, B-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num = 1 limit 50;
--Execution time: 0.535 ms

/* value not present in the table, B-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num = 21 limit 50;
--Execution time: 0.050 ms

/* values present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[1] limit 50;
--Execution time: 0.585 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[1,2] limit 50;
--Execution time: 0.561 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[1,2,3] limit 50;
--Execution time: 0.537 ms

/* values not present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[21] limit 50;
--Execution time: 3572.286 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[21,22] limit 50;
--Execution time: 3944.530 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[21,22,23] limit 50;
--Execution time: 4130.662 ms
通過對比可以看到B樹索引新增limit效能更高,只返回limit限定的資料,無論表中是否包含條件值.
陣列索引分兩種情況,表中包含條件值、表中不包含條件值.
6.1 陣列索引和limit
6.1.1 表中包含條件值
不會使用陣列索引,使用全表掃描,但是有limit限定,所以速度很快.
6.1.2 表中不包含條件值
不會使用陣列索引,使用全表掃描,因為值不包含在表中,所以需要全表掃描,然後過濾所有資料,速度非常慢.
6.1.2.1 解決方案-使用with
with會使用陣列索引.
/* Workaround 1: put the filter inside a CTE and apply LIMIT outside it.
   The array-containment operator was garbled to "[email protected]>" in the
   published text; it is restored to "ref @> ..." here.
   NOTE(review): the article relies on the CTE being planned separately from
   the outer LIMIT. Newer PostgreSQL releases (12 and later) may inline such a
   CTE unless it is written "AS MATERIALIZED" -- confirm on your version. */

/* values present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
with cte as (
    select objectid from test where ref @> array[1]
)
select * from cte limit 10;
--Execution time: 293.301 ms
explain (analyze,verbose,costs,buffers,timing)
with cte as (
    select objectid from test where ref && array[1,2]
)
select * from cte limit 10;
--Execution time: 464.427 ms
explain (analyze,verbose,costs,buffers,timing)
with cte as (
    select objectid from test where ref && array[1,2,3]
)
select * from cte limit 10;
--Execution time: 717.172 ms

/* values not present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
with cte as (
    select objectid from test where ref @> array[21]
)
select * from cte limit 10;
--Execution time: 0.075 ms
explain (analyze,verbose,costs,buffers,timing)
with cte as (
    select objectid from test where ref && array[21,22]
)
select * from cte limit 10;
--Execution time: 0.078 ms
explain (analyze,verbose,costs,buffers,timing)
with cte as (
    select objectid from test where ref && array[21,22,23]
)
select * from cte limit 10;
--Execution time: 0.079 ms
6.1.2.2 解決方案-禁用全表掃描
禁用全表掃描後,PostgreSQL會自動選擇合適的索引,在本例中使用了索引idx_test_ref.類似Oracle的強制索引.
set enable_seqscan只對當前會話有效,注意使用完成後要開啟.
/* Workaround 2: disable sequential scans for this session while testing.
   The array-containment operator was garbled to "[email protected]>" in the
   published text; it is restored to "ref @> ..." here. */
set enable_seqscan = off;

/* values present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[1] limit 50;
--Execution time: 297.018 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[1,2] limit 50;
--Execution time: 466.661 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[1,2,3] limit 50;
--Execution time: 708.372 ms

/* values not present in the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[21] limit 50;
-- NOTE(review): only the planning time was recorded for this query.
--Planning time: 0.089 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[21,22] limit 50;
--Execution time: 0.065 ms
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref && array[21,22,23] limit 50;
--Execution time: 0.066 ms

-- Turn sequential scans back on once the test is finished.
set enable_seqscan = on;
6.1.3 小結
- 索引掃描的成本較昂貴,但因返回的資料少,所以比較快.
- limit會對查詢行為產生較大的影響,設定了limit後需重新檢視執行計劃.
- order by也會對查詢行為產生較大的影響,需結合需求和執行計劃調整.
- 如果是單個條件(例如本例),且大多數情況下表包含值,建議使用“6.常規limit測試”,偶爾有表不包含的值時對總體影響不大.
- 如果是多個條件,建議使用“6.1.2.1 解決方案-使用with”,它和禁用全表掃描效果差不多.具體使用哪種需結合需求和執行計劃調整.如下:
-- Multiple conditions: filter inside the CTE, apply LIMIT outside it.

-- num and ref values that both exist in the table
explain (analyze,verbose,costs,buffers,timing)
with cte as (
    select objectid
    from test
    where num = 1
      and ref && array[1,2,3]
)
select * from cte limit 10;

-- ref values that do not exist in the table
explain (analyze,verbose,costs,buffers,timing)
with cte as (
    select objectid
    from test
    where num = 1
      and ref && array[21,22,23]
)
select * from cte limit 10;