1. 程式人生 > >MySQL INNER JOIN演算法的效率分析

MySQL INNER JOIN演算法的效率分析

MySQL處理JOIN的方法如下:(摘自MySQL 5.1 參考手冊中文版)

假定我們有一個如下形式的表T1、T2、T3的聯接查詢:

SELECT * FROM T1 INNER JOIN T2 ON P1(T1,T2)
INNER JOIN T3 ON P2(T2,T3)
WHERE P(T1,T2,T3).

這裡,P1(T1,T2)和P2(T2,T3)是一些聯接條件(表示式),而P(T1,T2,T3)是表T1、T2、T3的列上的一個條件。

巢狀環聯接演算法將按下面的方式執行該查詢:

FOR each row t1 in T1 {
    FOR each row t2 in T2 such that P1(t1,t2) {
        FOR each row t3 in T3 such that P2(t2,t3) {
            IF P(t1,t2,t3) {
                t:=t1||t2||t3; OUTPUT t;
            }
        }
    }
}

符號t1||t2||t3表示“連線行t1、t2和t3的列組成的行”。

其實我覺得,如果P(t1,t2,t3)可以拆成只涉及單一表的條件P(t1)、P(t2)、P(t3)(彼此以AND連接),完全可以在進入下一層迴圈之前就先判斷對應的條件,像這樣(後來仔細看了文件,MySQL在處理內聯接時其實也會做這樣的優化):

FOR each row t1 in T1 {
    IF P(t1) {
        FOR each row t2 in T2 such that P1(t1,t2) {
            IF P(t2) {
                FOR each row t3 in T3 such that P2(t2,t3) {
                    IF P(t3) {
                        t:=t1||t2||t3; OUTPUT t;
                    }
                }
            }
        }
    }
}

甚至更快的是把條件全部合併起來:

FOR each row t1 in T1 {
    IF P(t1) {
        FOR each row t2 in T2 such that (P1(t1,t2) && P(t2)) {
            FOR each row t3 in T3 such that (P2(t2,t3) && P(t3)) {
                t:=t1||t2||t3; OUTPUT t;
            }
        }
    }
}

我寫了個程式,比較方法一(MySQL的方法)和方法三的效率,結果方法三明顯較快。

#include <iostream>
#include <cstdlib>
#include <time.h>
#define MAXN 100000
 
using namespace std;
 
// Three single-column "tables" joined by the benchmark in main().
// main() fills them before timing: a[i] = i, b[i] = MAXN - i,
// c[i] = rand() % MAXN.
int a[MAXN];
int b[MAXN];
int c[MAXN];
// Running number of result rows printed by main(); reset to 0 before each
// timed pass and pre-incremented for every row that is output.
int count = 0;
 
int main() {
	clock_t start, finish;
	double time1, time2;
 
	count = 0;
	srand(time(0));
 
	for(int i=0; i<MAXN; ++i) {
		a[i] = i;
		b[i] = MAXN-i;
		c[i] = rand()%MAXN;
	}
 
	start=clock();
	for(int i=0; i<MAXN; ++i) {
		for(int j=10; j<MAXN; ++j) {
			if (a[i]==b[j]) {
				for(int k=0; k<MAXN; ++k) {
					if(b[j]==c[k]) {
						if (a[i]>500 and b[j] < 800 and c[k]>120) {
							cout << ++count << ':' <<a[i] << ',' << b[j] << ',' << c[k] << endl;
						}
					}
				}
			}
		}
	}
	finish = clock();
	time1 = (double)(finish-start)/CLOCKS_PER_SEC;
 
	count = 0;
	start=clock();
	for(int i=0; i<MAXN; ++i) {
		if (a[i]>500) {
			for(int j=10; j<MAXN; ++j) {
				if (a[i]==b[j] and b[j] < 800) {
					for(int k=0; k<MAXN; ++k) {
						if(b[j]==c[k] and c[k]>120) {
							cout << ++count << ':' <<a[i] << ',' << b[j] << ',' << c[k] << endl;
						}
					}
				}
			}
		}
	}
	finish = clock();
	time2 = (double)(finish-start)/CLOCKS_PER_SEC;
 
	cout << time1 << "VS" << time2 << endl;
	return 0;
}