Skip to content

Commit 778a7e9

Browse files
committed
Support join cardinality estimation if distinct_count is set
Currently we require max and min to be set, as they might be used to estimate the distinct count. This is unnecessarily conservative when distinct_count has actually been provided: in that case max and min are not used at all, so their presence or absence has no influence on the quality of the estimate.
1 parent b084aa4 commit 778a7e9

File tree

1 file changed

+18
-20
lines changed
  • datafusion/physical-plan/src/joins

1 file changed

+18
-20
lines changed

datafusion/physical-plan/src/joins/utils.rs

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -563,11 +563,14 @@ fn estimate_inner_join_cardinality(
563563
.iter()
564564
.zip(right_stats.column_statistics.iter())
565565
{
566-
// Break if any of statistics bounds are undefined
567-
if left_stat.min_value.get_value().is_none()
568-
|| left_stat.max_value.get_value().is_none()
569-
|| right_stat.min_value.get_value().is_none()
570-
|| right_stat.max_value.get_value().is_none()
566+
// Break if we don't have enough information to calculate a distinct count
567+
// If distinct_count isn't provided directly, we need min and max to be provided
568+
if (left_stat.distinct_count.get_value().is_none()
569+
&& (left_stat.min_value.get_value().is_none()
570+
|| left_stat.max_value.get_value().is_none()))
571+
|| (right_stat.distinct_count.get_value().is_none()
572+
&& (right_stat.min_value.get_value().is_none()
573+
|| right_stat.max_value.get_value().is_none()))
571574
{
572575
return None;
573576
}
@@ -2016,20 +2019,20 @@ mod tests {
20162019
),
20172020
// When we have distinct count.
20182021
(
2019-
(10, Inexact(1), Inexact(10), Inexact(10), Absent),
2020-
(10, Inexact(1), Inexact(10), Inexact(10), Absent),
2022+
(10, Absent, Absent, Inexact(10), Absent),
2023+
(10, Absent, Absent, Inexact(10), Absent),
20212024
Some(Inexact(10)),
20222025
),
20232026
// distinct(left) > distinct(right)
20242027
(
2025-
(10, Inexact(1), Inexact(10), Inexact(5), Absent),
2026-
(10, Inexact(1), Inexact(10), Inexact(2), Absent),
2028+
(10, Absent, Absent, Inexact(5), Absent),
2029+
(10, Absent, Absent, Inexact(2), Absent),
20272030
Some(Inexact(20)),
20282031
),
20292032
// distinct(right) > distinct(left)
20302033
(
2031-
(10, Inexact(1), Inexact(10), Inexact(2), Absent),
2032-
(10, Inexact(1), Inexact(10), Inexact(5), Absent),
2034+
(10, Absent, Absent, Inexact(2), Absent),
2035+
(10, Absent, Absent, Inexact(5), Absent),
20332036
Some(Inexact(20)),
20342037
),
20352038
// min(left) < 0 (range(left) > range(right))
@@ -2071,18 +2074,13 @@ mod tests {
20712074
),
20722075
// No min or max (or both).
20732076
(
2074-
(10, Absent, Absent, Inexact(3), Absent),
2075-
(10, Absent, Absent, Inexact(3), Absent),
2076-
None,
2077-
),
2078-
(
2079-
(10, Inexact(2), Absent, Inexact(3), Absent),
2080-
(10, Absent, Inexact(5), Inexact(3), Absent),
2077+
(10, Absent, Absent, Absent, Absent),
2078+
(10, Absent, Absent, Absent, Absent),
20812079
None,
20822080
),
20832081
(
2084-
(10, Absent, Inexact(3), Inexact(3), Absent),
2085-
(10, Inexact(1), Absent, Inexact(3), Absent),
2082+
(10, Inexact(2), Absent, Absent, Absent),
2083+
(10, Absent, Inexact(5), Absent, Absent),
20862084
None,
20872085
),
20882086
(

0 commit comments

Comments
 (0)