Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Fix wrong join_rel size estimates for anti join. #934

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions src/backend/optimizer/path/clausesel.c
Original file line number Diff line number Diff line change
Expand Up @@ -407,14 +407,7 @@ clauselist_selectivity_ext(PlannerInfo *root,
}

pfree(rgsel);
/*
* For Anti Semi Join, selectivity is determined by the fraction of
* tuples that do no match
*/
if (JOIN_ANTI == jointype || JOIN_LASJ_NOTIN == jointype)
{
s1 = (1 - s1);
}

return s1;
}

Expand Down
28 changes: 28 additions & 0 deletions src/test/regress/expected/join_gp.out
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,34 @@ select * from l l1 join l l2 on l1.a = l2.a left join l l3 on l1.a = l3.a and l1
2 | 2 | 2
(5 rows)

--
-- test anti_join/left_anti_semi_join selectivities
--
create table aj_t1(a int, b int, c int) distributed by (a);
create table aj_t2(a int, b int, c int) distributed by (a);
insert into aj_t1 values(1,1,1);
insert into aj_t2 values(1,1,1),(2,2,2);
explain(costs off) select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);
QUERY PLAN
------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Anti Join
Hash Cond: ((t1.b = t2.b) AND (t1.c = t2.c))
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: t1.b, t1.c
-> Seq Scan on aj_t1 t1
-> Hash
-> Redistribute Motion 3:3 (slice3; segments: 3)
Hash Key: t2.b, t2.c
-> Seq Scan on aj_t2 t2
Optimizer: Postgres query optimizer
(11 rows)

select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);
a
---
(0 rows)

--
-- test hash join
--
Expand Down
25 changes: 25 additions & 0 deletions src/test/regress/expected/join_gp_optimizer.out
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,31 @@ select * from l l1 join l l2 on l1.a = l2.a left join l l3 on l1.a = l3.a and l1
2 | 2 | 2
(5 rows)

--
-- test anti_join/left_anti_semi_join selectivities
--
create table aj_t1(a int, b int, c int) distributed by (a);
create table aj_t2(a int, b int, c int) distributed by (a);
insert into aj_t1 values(1,1,1);
insert into aj_t2 values(1,1,1),(2,2,2);
explain(costs off) select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);
QUERY PLAN
------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Anti Join
Hash Cond: ((aj_t1.b = aj_t2.b) AND (aj_t1.c = aj_t2.c))
-> Seq Scan on aj_t1
-> Hash
-> Broadcast Motion 3:3 (slice2; segments: 3)
-> Seq Scan on aj_t2
Optimizer: GPORCA
(8 rows)

select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);
a
---
(0 rows)

--
-- test hash join
--
Expand Down
26 changes: 13 additions & 13 deletions src/test/regress/expected/notin.out
Original file line number Diff line number Diff line change
Expand Up @@ -317,19 +317,19 @@ select c1 from t1 where c1 > 6 and c1 not in
--
explain select c1 from t1,t2 where c1 not in
(select c3 from t3) and c1 = c2;
QUERY PLAN
-------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=5.38..8.62 rows=4 width=4)
-> Hash Join (cost=5.38..8.62 rows=2 width=4)
Hash Cond: (t1.c1 = t2.c2)
-> Hash Left Anti Semi (Not-In) Join (cost=2.26..5.46 rows=2 width=4)
Hash Cond: (t1.c1 = t3.c3)
-> Seq Scan on t1 (cost=0.00..3.10 rows=4 width=4)
-> Hash (cost=2.15..2.15 rows=3 width=4)
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..2.15 rows=3 width=4)
-> Seq Scan on t3 (cost=0.00..2.03 rows=1 width=4)
-> Hash (cost=3.05..3.05 rows=2 width=4)
-> Seq Scan on t2 (cost=0.00..3.05 rows=2 width=4)
QUERY PLAN
-------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=2.12..3.25 rows=4 width=4)
-> Hash Left Anti Semi (Not-In) Join (cost=2.12..3.20 rows=1 width=4)
Hash Cond: (t1.c1 = t3.c3)
-> Hash Join (cost=1.04..2.10 rows=2 width=4)
Hash Cond: (t1.c1 = t2.c2)
-> Seq Scan on t1 (cost=0.00..1.03 rows=3 width=4)
-> Hash (cost=1.02..1.02 rows=2 width=4)
-> Seq Scan on t2 (cost=0.00..1.02 rows=2 width=4)
-> Hash (cost=1.05..1.05 rows=3 width=4)
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1.05 rows=3 width=4)
-> Seq Scan on t3 (cost=0.00..1.01 rows=1 width=4)
Optimizer: Postgres query optimizer
(12 rows)

Expand Down
12 changes: 12 additions & 0 deletions src/test/regress/sql/join_gp.sql
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,18 @@ create temp table l(a int);
insert into l values (1), (1), (2);
select * from l l1 join l l2 on l1.a = l2.a left join l l3 on l1.a = l3.a and l1.a = 2 order by 1,2,3;

--
-- test anti_join/left_anti_semi_join selectivities
--
create table aj_t1(a int, b int, c int) distributed by (a);
create table aj_t2(a int, b int, c int) distributed by (a);
insert into aj_t1 values(1,1,1);
insert into aj_t2 values(1,1,1),(2,2,2);

explain(costs off) select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);

select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);

--
-- test hash join
--
Expand Down
Loading