|
本文内容遵从CC版权协议, 可以随意转载, 但必须以超链接形式标明文章原始出处和作者信息及版权声明
网址: http://www.penglixun.com/tech/database/mysql_outer_join_analyse.html 上次讨论了MySQL INNER JOIN算法的效率,怪自己没看仔细官方文档,实际上MySQL对内联查询采用了“下推”的方法,见官方文档。
理论上下推也是可以用到外联接上的,没看懂官方的那段伪代码,根据自己的想法写了一段测试代码,就是昨天代码的改进。
下面是官方给出的采用下推的算法:
FOR each row t1 in T1 such that C1(t1) {
BOOL f1:=FALSE;
FOR each row t2 in T2
such that P1(t1,t2) AND (f1?C2(t2):TRUE) {
BOOL f2:=FALSE;
FOR each row t3 in T3
such that P2(t2,t3) AND (f1&&f2?C3(t3):TRUE) {
IF (f1&&f2?TRUE:(C2(t2) AND C3(t3))) {
t:=t1||t2||t3; OUTPUT t;
}
f2=TRUE;
f1=TRUE;
}
IF (!f2) {
IF (f1?TRUE:C2(t2) && P(t1,t2,NULL)) {
t:=t1||t2||NULL; OUTPUT t;
}
f1=TRUE;
}
}
IF (!f1 && P(t1,NULL,NULL)) {
t:=t1||NULL||NULL; OUTPUT t;
}
}
|
下面是我写的测试,包括内联查询和左联查询的测试:
#include <iostream>
#include <cstdlib>
#include <time.h>
#define MAXN 10000
#define LIMIT 500
using namespace std;
//计时器
class Timer {
public :
//构造函数
Timer ();
//析构函数
~Timer ();
//开始计时
void begin();
//计时结束
void end();
//获取时间
double get_time();
private :
clock_t start, finish;
double time;
};
Timer::Timer () {
start = 0;
finish = 0;
}
Timer::~Timer () {
start = 0;
finish = 0;
}
void Timer::begin () {
start = clock();
}
void Timer::end () {
finish = clock();
}
double Timer::get_time() {
time = (double)(finish-start)/CLOCKS_PER_SEC;
return time;
}
int a[MAXN];
int b[MAXN];
int c[MAXN];
int d[MAXN];
int p[4][2];
//初始化测试数据
void init () {
srand(time(0));
//参与关键查询的数据
//cout << "a\tb\tc\td" << endl;
for(int i=0; i<MAXN; ++i) {
a[i] = i;
b[i] = rand()%MAXN;
c[i] = rand()%MAXN;
d[i] = rand()%MAXN;
//cout << a << "\t"<< b << "\t" << c << "\t" << d << endl;
}
//查询的限制条件
for (int i=0; i<4; ++i) {
cout << i << ": ";
for (int j=0; j<2; ++j) {
//p[0]随机一个小于MAXN的1/2的数,p[1]随机一个大于MAXN的1/2的数
p[i][j] = rand()%(int)(MAXN/2) + (int)(MAXN/2)*j;
}
cout << p[i][0] << ", " << p[i][1] << endl;
}
return ;
}
//格式化打印
void print(int &cnt,int x, int y, int z) {
cnt++;
if (cnt%LIMIT == 0) {
cout << "Row " << cnt << ": ";
if (z!=-1) {
cout << x << ',' << y << ',' << z << endl;
}
else {
if (y!=-1) {
cout << x << ',' << y << ',' << "N" << endl;
}
else {
cout << x << ',' << "N" << ',' << "N" << endl;
}
}
}
return ;
}
//内联查询测试
//SELECT *
//FROM a INNER JOIN b ON a.id=b.id
//INNER JOIN c ON b.id=c.id
//WHERE a.id BETWEEN p00 AND p01
//AND b.id BETWEEN p10 AND p11
//AND c.id BETWEEN p20 AND p11
void innerJoin () {
int count1, count2;
Timer timer;
double time1, time2;
cout << "====Inner Join Test====" << endl;
//直接JOIN再判断筛选条件
cout << "Test1:" << endl;
count1 = 0;
timer.begin ();
for(int i=0; i<MAXN; ++i) {
for(int j=0; j<MAXN; ++j) {
if (a[i]==b[j]) {
for(int k=0; k<MAXN; ++k) {
if(b[j]==c[k]) {
if (a[i]>p[0][0] && a[i]<p[0][1] \
&& b[j]>p[1][0] && b[j]<p[1][1] \
&& c[k]>p[2][0] && c[k]<p[2][1]) {
print(count1, a[i], b[j], c[k]);
}
}
}
}
}
}
timer.end ();
time1 = timer.get_time();
//先判断筛选条件再JOIN
cout << "Test2:" << endl;
count2 = 0;
timer.begin ();
for(int i=0; i<MAXN; ++i) {
if (a[i]>p[0][0] && a[i]<p[0][1]) {
for(int j=0; j<MAXN; ++j) {
if (a[i]==b[j] && b[j]>p[1][0] && b[j]<p[1][1]) {
for(int k=0; k<MAXN; ++k) {
if(b[j]==c[k] && c[k]>p[2][0] && c[k]<p[2][1]) {
print(count2, a[i], b[j], c[k]);
}
}
}
}
}
}
timer.end ();
time2 = timer.get_time();
//校验数据的正确性并输出
if(count1 == count2) {
cout << endl;
cout << count1 << " Rows: ";
cout << time1 << " VS " << time2 << endl;
}
return ;
}
//外联查询测试
//SELECT *
//FROM a LEFT JOIN
//(b LEFT JOIN c ON b.id=c.id)
//ON a.id=b.id
//WHERE a.id BETWEEN p00 AND p01
//AND b.id BETWEEN p10 AND p11
//AND c.id BETWEEN p20 AND p11
void leftJoin () {
bool flag1, flag2;
int count1, count2, count3;
Timer timer;
double time1, time2, time3;
cout << "====Left Join Test====" << endl;
//先JOIN再判断条件
cout << "Test1:" << endl;
count1 = 0;
timer.begin ();
for(int i=0; i<MAXN; ++i) {
flag1 = false; //标记a,b表是否有匹配行
for(int j=0; j<MAXN; ++j) {
if (a[i]==b[j]) {
flag2 = false; //标记b,c表是否有匹配行
for(int k=0; k<MAXN; ++k) {
if(b[j]==c[k]) {
if (a[i]>p[0][0] && a[i]<p[0][1] \
&& b[j]>p[1][0] && b[j]<p[1][1] \
&& c[k]>p[2][0] && c[k]<p[2][1]) {
print(count1, a[i], b[j], c[k]);
flag2 = true;
flag1 = true;
}
}
}
if(!flag2) {
if(a[i]>p[0][0] && a[i]<p[0][1] \
&& b[j]>p[1][0] && b[j]<p[1][1]) {
print(count1, a[i], b[j], -1);
flag1 = true;
}
}
}
}
if(!flag1) {
if(a[i]>p[0][0] && a[i]<p[0][1]) {
print(count1, a[i], -1, -1);
}
}
}
timer.end ();
time1 = timer.get_time();
//先判断条件再JOIN
cout << "Test2:" << endl;
count2 = 0;
timer.begin ();
for(int i=0; i<MAXN; ++i) {
flag1 = false;
if (a[i]>p[0][0] && a[i]<p[0][1]) {
for(int j=0; j<MAXN; ++j) {
if (a[i]==b[j] && b[j]>p[1][0] && b[j]<p[1][1]) {
flag2 = false;
for(int k=0; k<MAXN; ++k) {
if(b[j]==c[k] && c[k]>p[2][0] && c[k]<p[2][1]) {
print(count2, a[i], b[j], c[k]);
flag1 = true;
flag2 = true;
}
}
if(!flag2) {
print(count2, a[i], b[j], -1);
flag1 = true;
}
}
}
if(!flag1) {
print(count2, a[i], -1, -1);
}
}
}
timer.end ();
time2 = timer.get_time();
//先判断选择条件再判断关联条件再JOIN,其实跟上面基本是一样的
cout << "Test3:" << endl;
count3 = 0;
timer.begin ();
for(int i=0; i<MAXN; ++i) {
if (a[i]>p[0][0] && a[i]<p[0][1]) {
flag1 = false;
for(int j=0; j<MAXN; ++j) {
if (b[j]>p[1][0] && b[j]<p[1][1]) {
if(a[i]==b[j]) {
flag2 = false;
for(int k=0; k<MAXN; ++k) {
if(c[k]>p[2][0] && c[k]<p[2][1]) {
if(b[j]==c[k]) {
print(count3, a[i], b[j], c[k]);
flag1 = true;
flag2 = true;
}
}
}
if(!flag2) {
print(count3, a[i], b[j], -1);
flag1 = true;
}
}
}
}
if(!flag1) {
print(count3, a[i], -1, -1);
}
}
}
timer.end ();
time3 = timer.get_time();
//校验数据的正确性并输出
if(count1==count2 && count2==count3) {
cout << endl;
cout << "Fetch Rows: " << count1 << endl;
cout << time1 << " VS " << time2 << " VS " << time3 << endl;
}
else {
cout << "Error: " << count1 << " <> " << count2 << " <> " << count3 << endl;
}
return ;
}
int main() {
init();
innerJoin();
leftJoin();
return 0;
}
|
对于左联查询,我的测试结果是,Test2的方法好于Test1,Test3经常还不如Test1,虽然加入了选择条件判断。
主要原因应该是,C++判断AND条件只要有一个不满足就判定为false,于是if越少判断越快。
欢迎探讨MySQL的算法实现和效率。 |
|