From ce419cf02c3cb33abb4f6a00dce8b03e37ad301d Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 12 Jun 2026 09:25:20 -0400 Subject: [PATCH 1/4] feat: Prefer `RightSemi`, `RightAnti`, `RightMark` --- datafusion/common/src/config.rs | 16 + .../physical_optimizer/join_selection.rs | 355 +++++++++++++++++- .../physical-optimizer/src/join_selection.rs | 137 +++++-- datafusion/sqllogictest/test_files/cte.slt | 2 +- .../dynamic_filter_pushdown_config.slt | 21 +- .../test_files/information_schema.slt | 2 + .../test_files/join_limit_pushdown.slt | 4 +- datafusion/sqllogictest/test_files/joins.slt | 4 +- .../test_files/push_down_filter_parquet.slt | 6 +- .../test_files/tpch/plans/q11.slt.part | 44 +-- .../test_files/tpch/plans/q18.slt.part | 30 +- .../test_files/tpch/plans/q19.slt.part | 14 +- .../test_files/tpch/plans/q2.slt.part | 84 ++--- .../test_files/tpch/plans/q20.slt.part | 48 +-- .../test_files/tpch/plans/q21.slt.part | 50 +-- .../test_files/tpch/plans/q22.slt.part | 12 +- .../test_files/tpch/plans/q4.slt.part | 14 +- .../test_files/tpch/plans/q5.slt.part | 48 +-- .../test_files/tpch/plans/q8.slt.part | 66 ++-- datafusion/sqllogictest/test_files/union.slt | 52 ++- docs/source/user-guide/configs.md | 1 + 21 files changed, 737 insertions(+), 273 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 07196d009c54c..a58ae6ef99946 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1498,6 +1498,22 @@ config_namespace! { /// will be collected into a single partition pub hash_join_single_partition_threshold_rows: usize, default = 1024 * 128 + /// The bias when choosing between `RightSemi`/`RightAnti`/`RightMark` + /// and `LeftSemi`/`LeftAnti`/`LeftMark` for semi, anti, and mark hash + /// joins. For these joins, one input's rows form the output ("preserved + /// side"), while the other input determines which rows should be kept + /// ("filter side"). A `RightSemi` hash join builds the hash table on + /// the filter side and streams the preserved side; a `LeftSemi` hash + /// join does the inverse. `RightSemi`, `RightAnti`, or `RightMark` is + /// used unless statistics show that the filter side is more than this + /// factor larger than the preserved side, comparing estimated total + /// byte sizes when both sides report them and row counts otherwise. + /// When statistics are missing on either side, the configured + /// preference is used: a bias value greater than or equal to 1 uses + /// `RightSemi`/`RightAnti`/`RightMark`, while a value below 1 uses + /// `LeftSemi`/`LeftAnti`/`LeftMark`. + pub semi_join_swap_bias: f64, default = 2.0 + /// Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. /// Build sides larger than this will use hash table lookups instead. /// Set to 0 to always use hash table lookups. diff --git a/datafusion/core/tests/physical_optimizer/join_selection.rs b/datafusion/core/tests/physical_optimizer/join_selection.rs index 29a2b59e5725d..c2f384657bc36 100644 --- a/datafusion/core/tests/physical_optimizer/join_selection.rs +++ b/datafusion/core/tests/physical_optimizer/join_selection.rs @@ -419,6 +419,335 @@ async fn test_join_with_swap_mark() { } } +/// Statistics with the given row count and byte size +fn sized_statistics(num_rows: usize, total_byte_size: usize) -> Statistics { + Statistics { + num_rows: Precision::Inexact(num_rows), + total_byte_size: Precision::Inexact(total_byte_size), + column_statistics: vec![ColumnStatistics::new_unknown()], + } +} + +/// Create a hash join between single-column inputs `l_col` and `r_col` +/// carrying the given statistics. +fn join_with_stats( + left_stats: Statistics, + right_stats: Statistics, + join_type: JoinType, + mode: PartitionMode, +) -> Arc { + let left: Arc = Arc::new(StatisticsExec::new( + left_stats, + Schema::new(vec![Field::new("l_col", DataType::Int32, false)]), + )); + let right: Arc = Arc::new(StatisticsExec::new( + right_stats, + Schema::new(vec![Field::new("r_col", DataType::Int32, false)]), + )); + Arc::new( + HashJoinExec::try_new( + Arc::clone(&left), + Arc::clone(&right), + vec![( + Arc::new(Column::new_with_schema("l_col", &left.schema()).unwrap()), + Arc::new(Column::new_with_schema("r_col", &right.schema()).unwrap()), + )], + None, + &join_type, + None, + mode, + NullEquality::NullEqualsNothing, + false, + ) + .unwrap(), + ) +} + +/// Assert that the optimized plan is a hash join of the given type whose +/// build (left) side is the input holding `expected_build_column`. +fn assert_join_orientation( + plan: &Arc, + expected_join_type: JoinType, + expected_build_column: &str, +) { + let join = plan + .downcast_ref::() + .expect("optimized plan should still be a hash join"); + assert_eq!(*join.join_type(), expected_join_type); + assert_eq!( + join.left().schema().field(0).name().as_str(), + expected_build_column + ); +} + +/// Semi, anti, and mark joins prefer `RightSemi`, `RightAnti`, or +/// `RightMark` even when the filter side is moderately larger than the +/// preserved side: a Left* join swaps when the filter side is within +/// `semi_join_swap_bias`, and the equivalent Right* join stays put. +#[tokio::test] +async fn test_semi_join_canonical_orientation_within_bias() { + // The filter side is 1.5x the preserved side, within the default bias of 2 + let preserved = || sized_statistics(1000, 10_000); + let filter = || sized_statistics(1500, 15_000); + + for join_type in [JoinType::LeftSemi, JoinType::LeftAnti, JoinType::LeftMark] { + let join = + join_with_stats(preserved(), filter(), join_type, PartitionMode::Partitioned); + let optimized = JoinSelection::new() + .optimize(join, &ConfigOptions::new()) + .unwrap(); + assert_join_orientation(&optimized, join_type.swap(), "r_col"); + } + + for join_type in [ + JoinType::RightSemi, + JoinType::RightAnti, + JoinType::RightMark, + ] { + let join = + join_with_stats(filter(), preserved(), join_type, PartitionMode::Partitioned); + let optimized = JoinSelection::new() + .optimize(join, &ConfigOptions::new()) + .unwrap(); + assert_join_orientation(&optimized, join_type, "l_col"); + } +} + +/// A filter side shown to be more than `semi_join_swap_bias` times larger +/// than the preserved side is too large to build on, so the optimizer chooses +/// `LeftSemi`, `LeftAnti`, or `LeftMark`. +#[tokio::test] +async fn test_semi_join_canonical_orientation_beyond_bias() { + // The filter side is 3x the preserved side, larger than the default bias of 2 + let preserved = || sized_statistics(1000, 10_000); + let filter = || sized_statistics(3000, 30_000); + + for join_type in [JoinType::LeftSemi, JoinType::LeftAnti, JoinType::LeftMark] { + let join = + join_with_stats(preserved(), filter(), join_type, PartitionMode::Partitioned); + let optimized = JoinSelection::new() + .optimize(join, &ConfigOptions::new()) + .unwrap(); + assert_join_orientation(&optimized, join_type, "l_col"); + } + + for join_type in [ + JoinType::RightSemi, + JoinType::RightAnti, + JoinType::RightMark, + ] { + let join = + join_with_stats(filter(), preserved(), join_type, PartitionMode::Partitioned); + let optimized = JoinSelection::new() + .optimize(join, &ConfigOptions::new()) + .unwrap(); + assert_join_orientation(&optimized, join_type.swap(), "r_col"); + } +} + +/// Absent statistics show nothing about the input size ratio. Semi, anti, and +/// mark joins therefore use `RightSemi`, `RightAnti`, and `RightMark` with the +/// default configuration settings. +#[tokio::test] +async fn test_semi_join_absent_stats_prefers_right_join_types() { + for join_type in [JoinType::LeftSemi, JoinType::LeftAnti, JoinType::LeftMark] { + let join = join_with_stats( + empty_statistics(), + empty_statistics(), + join_type, + PartitionMode::Auto, + ); + let optimized = JoinSelection::new() + .optimize(join, &ConfigOptions::new()) + .unwrap(); + assert_join_orientation(&optimized, join_type.swap(), "r_col"); + } + + for join_type in [ + JoinType::RightSemi, + JoinType::RightAnti, + JoinType::RightMark, + ] { + let join = join_with_stats( + empty_statistics(), + empty_statistics(), + join_type, + PartitionMode::Auto, + ); + let optimized = JoinSelection::new() + .optimize(join, &ConfigOptions::new()) + .unwrap(); + assert_join_orientation(&optimized, join_type, "l_col"); + } + + for join_type in [ + JoinType::Inner, + JoinType::Left, + JoinType::Right, + JoinType::Full, + ] { + let join = join_with_stats( + empty_statistics(), + empty_statistics(), + join_type, + PartitionMode::Auto, + ); + let optimized = JoinSelection::new() + .optimize(join, &ConfigOptions::new()) + .unwrap(); + assert_join_orientation(&optimized, join_type, "l_col"); + } +} + +/// Statistics covering only one input show nothing about the size ratio, so +/// the `RightSemi`/`RightAnti`/`RightMark` default applies even when the +/// filter side is known to be big and the preserved side is unknown. +#[tokio::test] +async fn test_semi_join_partial_stats_prefers_right_join_types() { + let join = join_with_stats( + empty_statistics(), + big_statistics(), + JoinType::LeftSemi, + PartitionMode::Partitioned, + ); + let optimized = JoinSelection::new() + .optimize(join, &ConfigOptions::new()) + .unwrap(); + assert_join_orientation(&optimized, JoinType::RightSemi, "r_col"); +} + +/// With `semi_join_swap_bias = 1`, `RightSemi`, `RightAnti`, or +/// `RightMark` is used unless statistics show that the filter side is larger +/// than the preserved side. +#[tokio::test] +async fn test_semi_join_bias_one_uses_right_unless_filter_is_larger() { + let mut config = ConfigOptions::new(); + config.optimizer.semi_join_swap_bias = 1.0; + + // The filter side is larger than the preserved side, so use Left*. + let join = join_with_stats( + sized_statistics(1000, 10_000), + sized_statistics(1500, 15_000), + JoinType::LeftSemi, + PartitionMode::Partitioned, + ); + let optimized = JoinSelection::new().optimize(join, &config).unwrap(); + assert_join_orientation(&optimized, JoinType::LeftSemi, "l_col"); + + // Equal statistics do not show that the filter side is larger, so use Right*. + let join = join_with_stats( + sized_statistics(1000, 10_000), + sized_statistics(1000, 10_000), + JoinType::LeftSemi, + PartitionMode::Partitioned, + ); + let optimized = JoinSelection::new().optimize(join, &config).unwrap(); + assert_join_orientation(&optimized, JoinType::RightSemi, "r_col"); + + // Absent statistics also use Right*. + let join = join_with_stats( + empty_statistics(), + empty_statistics(), + JoinType::LeftSemi, + PartitionMode::Partitioned, + ); + let optimized = JoinSelection::new().optimize(join, &config).unwrap(); + assert_join_orientation(&optimized, JoinType::RightSemi, "r_col"); +} + +/// A `semi_join_swap_bias` below 1 prefers `LeftSemi`, `LeftAnti`, or +/// `LeftMark`: the filter side must be smaller than `bias` times the preserved +/// side before the join is reoriented to Right*. When statistics are absent, +/// the configured `LeftSemi`/`LeftAnti`/`LeftMark` preference is used. +#[tokio::test] +async fn test_semi_join_bias_below_one_prefers_preserved_side() { + let mut config = ConfigOptions::new(); + config.optimizer.semi_join_swap_bias = 0.5; + + // The filter side is smaller than the preserved side, but not smaller + // than half of it, so the build stays on the preserved side (the + // symmetric comparison would have swapped here) + let join = join_with_stats( + sized_statistics(1000, 10_000), + sized_statistics(800, 8_000), + JoinType::LeftSemi, + PartitionMode::Partitioned, + ); + let optimized = JoinSelection::new().optimize(join, &config).unwrap(); + assert_join_orientation(&optimized, JoinType::LeftSemi, "l_col"); + + // The same sizes reorient a Right* join to its Left* counterpart + let join = join_with_stats( + sized_statistics(800, 8_000), + sized_statistics(1000, 10_000), + JoinType::RightSemi, + PartitionMode::Partitioned, + ); + let optimized = JoinSelection::new().optimize(join, &config).unwrap(); + assert_join_orientation(&optimized, JoinType::LeftSemi, "r_col"); + + // Absent statistics use the configured Left* preference + let join = join_with_stats( + empty_statistics(), + empty_statistics(), + JoinType::LeftSemi, + PartitionMode::Partitioned, + ); + let optimized = JoinSelection::new().optimize(join, &config).unwrap(); + assert_join_orientation(&optimized, JoinType::LeftSemi, "l_col"); +} + +/// Disabling `join_reordering` also disables semi join orientation. +#[tokio::test] +async fn test_semi_join_respects_join_reordering_flag() { + let mut config = ConfigOptions::new(); + config.optimizer.join_reordering = false; + + let join = join_with_stats( + sized_statistics(1000, 10_000), + sized_statistics(1500, 15_000), + JoinType::LeftSemi, + PartitionMode::Partitioned, + ); + let optimized = JoinSelection::new().optimize(join, &config).unwrap(); + assert_join_orientation(&optimized, JoinType::LeftSemi, "l_col"); +} + +/// Null-aware anti joins have fixed side requirements and are never +/// reoriented. +#[tokio::test] +async fn test_semi_join_null_aware_anti_never_swaps() { + let left: Arc = Arc::new(StatisticsExec::new( + empty_statistics(), + Schema::new(vec![Field::new("l_col", DataType::Int32, false)]), + )); + let right: Arc = Arc::new(StatisticsExec::new( + empty_statistics(), + Schema::new(vec![Field::new("r_col", DataType::Int32, false)]), + )); + let join = Arc::new( + HashJoinExec::try_new( + Arc::clone(&left), + Arc::clone(&right), + vec![( + Arc::new(Column::new_with_schema("l_col", &left.schema()).unwrap()), + Arc::new(Column::new_with_schema("r_col", &right.schema()).unwrap()), + )], + None, + &JoinType::LeftAnti, + None, + PartitionMode::Auto, + NullEquality::NullEqualsNothing, + true, + ) + .unwrap(), + ); + let optimized = JoinSelection::new() + .optimize(join, &ConfigOptions::new()) + .unwrap(); + assert_join_orientation(&optimized, JoinType::LeftAnti, "l_col"); +} + /// Compare the input plan with the plan after running the probe order optimizer. macro_rules! assert_optimized { ($PLAN: expr, @$EXPECTED_LINES: literal $(,)?) => { @@ -1258,6 +1587,7 @@ async fn test_cases_without_collect_left_check() -> Result<()> { let mut cases = vec![]; let join_types = vec![JoinType::LeftSemi, JoinType::Inner]; for join_type in join_types { + let expects_filter_side_swap = matches!(join_type, JoinType::LeftSemi); cases.push(TestCase { case: "Unbounded - Bounded / CollectLeft".to_string(), initial_sources_unbounded: (SourceType::Unbounded, SourceType::Bounded), @@ -1294,9 +1624,13 @@ async fn test_cases_without_collect_left_check() -> Result<()> { initial_join_type: join_type, initial_mode: PartitionMode::CollectLeft, expected_sources_unbounded: (SourceType::Bounded, SourceType::Bounded), - expected_join_type: join_type, + expected_join_type: if expects_filter_side_swap { + join_type.swap() + } else { + join_type + }, expected_mode: PartitionMode::CollectLeft, - expecting_swap: false, + expecting_swap: expects_filter_side_swap, }); cases.push(TestCase { case: "Unbounded - Bounded / Partitioned".to_string(), @@ -1324,9 +1658,13 @@ async fn test_cases_without_collect_left_check() -> Result<()> { initial_join_type: join_type, initial_mode: PartitionMode::Partitioned, expected_sources_unbounded: (SourceType::Bounded, SourceType::Bounded), - expected_join_type: join_type, + expected_join_type: if expects_filter_side_swap { + join_type.swap() + } else { + join_type + }, expected_mode: PartitionMode::Partitioned, - expecting_swap: false, + expecting_swap: expects_filter_side_swap, }); cases.push(TestCase { case: "Unbounded - Unbounded / Partitioned".to_string(), @@ -1353,6 +1691,7 @@ async fn test_not_support_collect_left() -> Result<()> { // [JoinType::LeftSemi] let the_ones_not_support_collect_left = vec![JoinType::Left, JoinType::LeftAnti]; for join_type in the_ones_not_support_collect_left { + let expects_filter_side_swap = matches!(join_type, JoinType::LeftAnti); cases.push(TestCase { case: "Unbounded - Bounded".to_string(), initial_sources_unbounded: (SourceType::Unbounded, SourceType::Bounded), @@ -1379,9 +1718,13 @@ async fn test_not_support_collect_left() -> Result<()> { initial_join_type: join_type, initial_mode: PartitionMode::Partitioned, expected_sources_unbounded: (SourceType::Bounded, SourceType::Bounded), - expected_join_type: join_type, + expected_join_type: if expects_filter_side_swap { + join_type.swap() + } else { + join_type + }, expected_mode: PartitionMode::Partitioned, - expecting_swap: false, + expecting_swap: expects_filter_side_swap, }); cases.push(TestCase { case: "Unbounded - Unbounded".to_string(), diff --git a/datafusion/physical-optimizer/src/join_selection.rs b/datafusion/physical-optimizer/src/join_selection.rs index 74c6cbb19aea9..c0472ca8967e0 100644 --- a/datafusion/physical-optimizer/src/join_selection.rs +++ b/datafusion/physical-optimizer/src/join_selection.rs @@ -69,8 +69,13 @@ fn get_stats( } } -// TODO: We need some performance test for Right Semi/Right Join swap to Left Semi/Left Join in case that the right side is smaller but not much smaller. -// TODO: In PrestoSQL, the optimizer flips join sides only if one side is much smaller than the other by more than SIZE_DIFFERENCE_THRESHOLD times, by default is 8 times. +// TODO: Joins that use this symmetric comparison swap their inputs on any size +// difference, however small. In PrestoSQL, the optimizer flips join sides only +// when one side is more than SIZE_DIFFERENCE_THRESHOLD times smaller than the +// other (8 times by default). Semi, anti, and mark hash joins apply such a +// threshold via `semi_join_swap_bias`; evaluate whether the join types compared +// here would benefit from one as well, e.g. a Right join swapped to a Left join +// when the right side is smaller but not much smaller. /// Checks whether join inputs should be swapped using available statistics. /// /// It follows these steps: @@ -114,6 +119,109 @@ pub(crate) fn should_swap_join_order( } } +/// Decides whether `hash_join` should swap its left and right inputs, subject +/// to join-type legality, null-aware anti join restrictions, join reordering +/// settings, and input statistics. +/// +/// For semi, anti, and mark joins, one input's rows form the output (the +/// preserved side), while the other input determines which rows should be kept +/// (the filter side). `RightSemi`, `RightAnti`, and `RightMark` are usually +/// preferable for these joins: because hash joins build their left input, the +/// build side then carries no output columns, the preserved side streams +/// through the right/probe input, and matching rows are emitted incrementally +/// instead of being buffered until the probe side is exhausted. The optimizer +/// therefore prefers `RightSemi`, `RightAnti`, or `RightMark` unless statistics +/// show that the filter side is more than `semi_join_swap_bias` times larger +/// than the preserved side. If statistics are partial or absent, the ratio is +/// unknown, so the configured preference is used: `semi_join_swap_bias` values +/// greater than or equal to 1 use `RightSemi`/`RightAnti`/`RightMark`, while +/// values below 1 use `LeftSemi`/`LeftAnti`/`LeftMark`. +fn should_swap_hash_join_inputs( + hash_join: &HashJoinExec, + config: &ConfigOptions, + registry: Option<&StatisticsRegistry>, +) -> Result { + // Null-aware anti joins have specific side requirements, so never swap them. + if !config.optimizer.join_reordering + || !hash_join.join_type().supports_swap() + || hash_join.null_aware + { + return Ok(false); + } + + // Never move an unbounded input to the build side: the build side must be + // fully consumed before probing can begin. + if hash_join.right().boundedness().is_unbounded() { + return Ok(false); + } + + let left = &**hash_join.left(); + let right = &**hash_join.right(); + + // For joins that preserve only one side (semi / anti / mark joins), compare + // the filter-to-preserved side ratio, factoring in `semi_join_swap_bias`. + let bias = config.optimizer.semi_join_swap_bias; + let builds_on_preserved_side = matches!( + hash_join.join_type(), + JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark + ); + let builds_on_filter_side = matches!( + hash_join.join_type(), + JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark + ); + if builds_on_preserved_side || builds_on_filter_side { + let (filter, preserved) = if builds_on_preserved_side { + (right, left) + } else { + (left, right) + }; + let should_build_on_filter_side = + match size_ratio_exceeds_threshold(filter, preserved, bias, registry)? { + Some(filter_side_too_large) => !filter_side_too_large, + // If nothing is known about the size ratio, use the configured + // bias to pick the join orientation. + None => bias >= 1.0, + }; + + return Ok(should_build_on_filter_side != builds_on_filter_side); + } + + // Other hash join types fall back to the generic logic. + should_swap_join_order(left, right, config, registry) +} + +/// Returns whether the available statistics show that `lhs / rhs > threshold`, +/// comparing total byte sizes when both sides report them and falling back to +/// row counts otherwise. Returns `None` when statistics are missing on either +/// side, which shows nothing about the size ratio. +fn size_ratio_exceeds_threshold( + lhs: &dyn ExecutionPlan, + rhs: &dyn ExecutionPlan, + threshold: f64, + registry: Option<&StatisticsRegistry>, +) -> Result> { + let lhs_stats = get_stats(lhs, registry)?; + let rhs_stats = get_stats(rhs, registry)?; + + let exceeds = |lhs: &usize, rhs: &usize| (*lhs as f64) > threshold * (*rhs as f64); + + Ok( + match ( + lhs_stats.total_byte_size.get_value(), + rhs_stats.total_byte_size.get_value(), + ) { + (Some(lhs), Some(rhs)) => Some(exceeds(lhs, rhs)), + _ => match ( + lhs_stats.num_rows.get_value(), + rhs_stats.num_rows.get_value(), + ) { + (Some(lhs), Some(rhs)) => Some(exceeds(lhs, rhs)), + _ => None, + }, + }, + ) +} + fn supports_collect_by_thresholds( plan: &dyn ExecutionPlan, threshold_byte_size: usize, @@ -196,6 +304,7 @@ impl PhysicalOptimizerRule for JoinSelection { /// - `config.optimizer.hash_join_single_partition_threshold`: byte threshold for `CollectLeft` /// - `config.optimizer.hash_join_single_partition_threshold_rows`: row threshold for `CollectLeft` /// - `config.optimizer.join_reordering`: allows or forbids input swapping +/// - `config.optimizer.semi_join_swap_bias`: Right*/Left* preference for semi/anti/mark joins pub(crate) fn try_collect_left( hash_join: &HashJoinExec, ignore_threshold: bool, @@ -223,11 +332,7 @@ pub(crate) fn try_collect_left( match (left_can_collect, right_can_collect) { (true, true) => { - // Don't swap null-aware anti joins as they have specific side requirements - if hash_join.join_type().supports_swap() - && !hash_join.null_aware - && should_swap_join_order(&**left, &**right, config, registry)? - { + if should_swap_hash_join_inputs(hash_join, config, registry)? { Ok(Some(hash_join.swap_inputs(PartitionMode::CollectLeft)?)) } else { Ok(Some(Arc::new( @@ -267,18 +372,13 @@ pub(crate) fn try_collect_left( /// /// Used configurations inside arg `config` /// - `config.optimizer.join_reordering`: allows or forbids statistics-driven join swapping +/// - `config.optimizer.semi_join_swap_bias`: Right*/Left* preference for semi/anti/mark joins pub(crate) fn partitioned_hash_join( hash_join: &HashJoinExec, config: &ConfigOptions, registry: Option<&StatisticsRegistry>, ) -> Result> { - let left = hash_join.left(); - let right = hash_join.right(); - // Don't swap null-aware anti joins as they have specific side requirements - if hash_join.join_type().supports_swap() - && !hash_join.null_aware - && should_swap_join_order(&**left, &**right, config, registry)? - { + if should_swap_hash_join_inputs(hash_join, config, registry)? { hash_join.swap_inputs(PartitionMode::Partitioned) } else { // Null-aware anti joins must use CollectLeft mode because they track probe-side state @@ -308,6 +408,7 @@ pub(crate) fn partitioned_hash_join( /// - `config.optimizer.hash_join_single_partition_threshold`: byte threshold for `CollectLeft` /// - `config.optimizer.hash_join_single_partition_threshold_rows`: row threshold for `CollectLeft` /// - `config.optimizer.join_reordering`: allows or forbids input swapping +/// - `config.optimizer.semi_join_swap_bias`: Right*/Left* preference for semi/anti/mark joins fn statistical_join_selection_subrule( plan: Arc, config: &ConfigOptions, @@ -327,13 +428,7 @@ fn statistical_join_selection_subrule( )? } PartitionMode::Partitioned => { - let left = hash_join.left(); - let right = hash_join.right(); - // Don't swap null-aware anti joins as they have specific side requirements - if hash_join.join_type().supports_swap() - && !hash_join.null_aware - && should_swap_join_order(&**left, &**right, config, registry)? - { + if should_swap_hash_join_inputs(hash_join, config, registry)? { hash_join .swap_inputs(PartitionMode::Partitioned) .map(Some)? diff --git a/datafusion/sqllogictest/test_files/cte.slt b/datafusion/sqllogictest/test_files/cte.slt index 0b93f6fc10177..c6c2a1aa4a53d 100644 --- a/datafusion/sqllogictest/test_files/cte.slt +++ b/datafusion/sqllogictest/test_files/cte.slt @@ -1027,7 +1027,7 @@ logical_plan 04)----SubqueryAlias: cte 05)------TableScan: person projection=[id] physical_plan -01)HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(id@0, id@0)] +01)HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(id@0, id@0)] 02)--DataSourceExec: partitions=1, partition_sizes=[0] 03)--DataSourceExec: partitions=1, partition_sizes=[0] diff --git a/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt b/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt index e779ce2cbffb0..9bcb6a123fb17 100644 --- a/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt +++ b/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt @@ -286,9 +286,12 @@ physical_plan 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet 03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] -# LEFT SEMI JOIN (physical LeftSemi): reverse table roles so optimizer keeps LeftSemi -# (right_parquet has 3 rows < left_parquet has 5 rows, so no swap occurs). +# LEFT SEMI JOIN (physical LeftSemi): set the bias to 1 so the larger filter +# side (left_parquet with 5 rows) causes the optimizer to choose LeftSemi. # Physical LeftSemi generates a self-generated dynamic filter on the probe side. +statement ok +set datafusion.optimizer.semi_join_swap_bias = 1; + query TT EXPLAIN SELECT r.* FROM right_parquet r @@ -316,6 +319,9 @@ WHERE r.id IN (SELECT l.id FROM left_parquet l); 3 right3 5 right5 +statement ok +reset datafusion.optimizer.semi_join_swap_bias; + # LEFT SEMI JOIN with ORDER BY: the join preserves probe-side ordering, so the # sort is pushed below the join. The join-generated dynamic filter should then # be pushed through the SortExec into the parquet scan predicate. @@ -385,8 +391,12 @@ physical_plan 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet 03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] -# LEFT MARK JOIN: the OR prevents decorrelation to LeftSemi, so the optimizer -# uses LeftMark. Self-generated dynamic filter pushes to the probe side. +# LEFT MARK JOIN: the OR prevents decorrelation to LeftSemi. Set the bias to 1 +# so the larger filter side (left_parquet with 5 rows) causes the optimizer to +# choose LeftMark. Self-generated dynamic filter pushes to the probe side. +statement ok +set datafusion.optimizer.semi_join_swap_bias = 1; + query TT EXPLAIN SELECT r.id, r.info FROM right_parquet r @@ -420,6 +430,9 @@ WHERE EXISTS (SELECT 1 FROM left_parquet l WHERE r.id = l.id) 3 right3 5 right5 +statement ok +reset datafusion.optimizer.semi_join_swap_bias; + # Test 2c: Parent dynamic filter (from TopK) pushed through semi/anti joins # Sort on the join key (id) so the TopK dynamic filter pushes to BOTH sides. diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 04ee70b963ceb..1545ab9b3b590 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -336,6 +336,7 @@ datafusion.optimizer.repartition_file_scans true datafusion.optimizer.repartition_joins true datafusion.optimizer.repartition_sorts true datafusion.optimizer.repartition_windows true +datafusion.optimizer.semi_join_swap_bias 2 datafusion.optimizer.skip_failed_rules false datafusion.optimizer.subset_repartition_threshold 4 datafusion.optimizer.top_down_join_key_reordering true @@ -493,6 +494,7 @@ datafusion.optimizer.repartition_file_scans true When set to `true`, datasource datafusion.optimizer.repartition_joins true Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level datafusion.optimizer.repartition_sorts true Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below ```text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ``` would turn into the plan below which performs better in multithreaded environments ```text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ``` datafusion.optimizer.repartition_windows true Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level +datafusion.optimizer.semi_join_swap_bias 2 The bias when choosing between `RightSemi`/`RightAnti`/`RightMark` and `LeftSemi`/`LeftAnti`/`LeftMark` for semi, anti, and mark hash joins. For these joins, one input's rows form the output ("preserved side"), while the other input determines which rows should be kept ("filter side"). A `RightSemi` hash join builds the hash table on the filter side and streams the preserved side; a `LeftSemi` hash join does the inverse. `RightSemi`, `RightAnti`, or `RightMark` is used unless statistics show that the filter side is more than this factor larger than the preserved side, comparing estimated total byte sizes when both sides report them and row counts otherwise. When statistics are missing on either side, the configured preference is used: a bias value greater than or equal to 1 uses `RightSemi`/`RightAnti`/`RightMark`, while a value below 1 uses `LeftSemi`/`LeftAnti`/`LeftMark`. datafusion.optimizer.skip_failed_rules false When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail datafusion.optimizer.subset_repartition_threshold 4 Partition count threshold for subset satisfaction optimization. When the current partition count is >= this threshold, DataFusion will skip repartitioning if the required partitioning expression is a subset of the current partition expression such as Hash(a) satisfies Hash(a, b). When the current partition count is < this threshold, DataFusion will repartition to increase parallelism even when subset satisfaction applies. Set to 0 to always repartition (disable subset satisfaction optimization). Set to a high value to always use subset satisfaction. Example (subset_repartition_threshold = 4): ```text Hash([a]) satisfies Hash([a, b]) because (Hash([a, b]) is subset of Hash([a]) If current partitions (3) < threshold (4), repartition: AggregateExec: mode=FinalPartitioned, gby=[a, b], aggr=[SUM(x)] RepartitionExec: partitioning=Hash([a, b], 8), input_partitions=3 AggregateExec: mode=Partial, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 3) If current partitions (8) >= threshold (4), use subset satisfaction: AggregateExec: mode=SinglePartitioned, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 8) ``` datafusion.optimizer.top_down_join_key_reordering true When set to true, the physical plan optimizer will run a top down process to reorder the join keys diff --git a/datafusion/sqllogictest/test_files/join_limit_pushdown.slt b/datafusion/sqllogictest/test_files/join_limit_pushdown.slt index 933b03e7ebd93..fe257485a33b1 100644 --- a/datafusion/sqllogictest/test_files/join_limit_pushdown.slt +++ b/datafusion/sqllogictest/test_files/join_limit_pushdown.slt @@ -139,7 +139,7 @@ logical_plan 04)----SubqueryAlias: __correlated_sq_1 05)------TableScan: t1 projection=[a] physical_plan -01)HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(x@0, a@0)], fetch=2 +01)HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(a@0, x@0)], fetch=2 02)--DataSourceExec: partitions=1, partition_sizes=[1] 03)--DataSourceExec: partitions=1, partition_sizes=[1] @@ -160,7 +160,7 @@ logical_plan 04)----SubqueryAlias: __correlated_sq_1 05)------TableScan: t1 projection=[a] physical_plan -01)HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(x@0, a@0)], fetch=1 +01)HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(a@0, x@0)], fetch=1 02)--DataSourceExec: partitions=1, partition_sizes=[1] 03)--DataSourceExec: partitions=1, partition_sizes=[1] diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 9be1d39d63605..ecd11ddebc64f 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -1340,7 +1340,7 @@ physical_plan 01)AggregateExec: mode=FinalPartitioned, gby=[t1_id@0 as t1_id], aggr=[] 02)--RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 03)----AggregateExec: mode=Partial, gby=[t1_id@0 as t1_id], aggr=[] -04)------HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(t1_id@0, t2_id@0)] +04)------HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)] 05)--------DataSourceExec: partitions=1, partition_sizes=[1] 06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 07)----------DataSourceExec: partitions=1, partition_sizes=[1] @@ -1460,7 +1460,7 @@ physical_plan 05)--------AggregateExec: mode=FinalPartitioned, gby=[alias1@0 as alias1], aggr=[] 06)----------RepartitionExec: partitioning=Hash([alias1@0], 2), input_partitions=2 07)------------AggregateExec: mode=Partial, gby=[t1_id@0 as alias1], aggr=[] -08)--------------HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(t1_id@0, t2_id@0)] +08)--------------HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)] 09)----------------DataSourceExec: partitions=1, partition_sizes=[1] 10)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 11)------------------DataSourceExec: partitions=1, partition_sizes=[1] diff --git a/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt index 40bfe79dcc633..a4affe161af51 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt @@ -887,9 +887,9 @@ WHERE EXISTS ( ); ---- Plan with Metrics -01)HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(a@0, a@0), (b@1, b@1)], metrics=[output_rows=2, output_batches=1, array_map_created_count=0, build_input_batches=1, build_input_rows=2, input_batches=2, input_rows=4, avg_fanout=100% (2/2), probe_hit_rate=100% (2/2)] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/lj_build.parquet]]}, projection=[a, b, c], file_type=parquet, metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=19.58% (196/1.00 K)] -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/lj_probe.parquet]]}, projection=[a, b], file_type=parquet, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ab AND b_null_count@5 != row_count@2 AND b_max@4 >= ba AND b_null_count@5 != row_count@2 AND b_min@6 <= bb, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=2, pushdown_rows_pruned=2, predicate_cache_inner_records=8, predicate_cache_records=4, scan_efficiency_ratio=14.89% (154/1.03 K)] +01)HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(a@0, a@0), (b@1, b@1)], metrics=[output_rows=2, output_batches=1, array_map_created_count=0, build_input_batches=1, build_input_rows=4, input_batches=1, input_rows=2, avg_fanout=100% (2/2), probe_hit_rate=100% (2/2)] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/lj_probe.parquet]]}, projection=[a, b], file_type=parquet, metrics=[output_rows=4, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=14.89% (154/1.03 K)] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/lj_build.parquet]]}, projection=[a, b, c], file_type=parquet, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ad AND b@1 >= ba AND b@1 <= bd AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}, {c0:ac,c1:bc}, {c0:ad,c1:bd}]) ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ad AND b_null_count@5 != row_count@2 AND b_max@4 >= ba AND b_null_count@5 != row_count@2 AND b_min@6 <= bd, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=2, pushdown_rows_pruned=0, predicate_cache_inner_records=4, predicate_cache_records=4, scan_efficiency_ratio=19.58% (196/1.00 K)] statement ok reset datafusion.explain.analyze_categories; diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part index cd86b618f03b0..c3b71bb8ff48f 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part @@ -81,29 +81,29 @@ physical_plan 06)----------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] 07)------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4 08)--------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] -09)----------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(s_nationkey@3, n_nationkey@0)], projection=[ps_partkey@0, ps_availqty@1, ps_supplycost@2] -10)------------------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=4 -11)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_availqty@2, ps_supplycost@3, s_nationkey@5] -12)----------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4 -13)------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost], constraints=[PrimaryKey([0, 1])], file_type=csv, has_header=false -14)----------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 -15)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -16)------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 -17)--------------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0] -18)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -19)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +09)----------------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(n_nationkey@0, s_nationkey@3)], projection=[ps_partkey@0, ps_availqty@1, ps_supplycost@2] +10)------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 +11)--------------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0] +12)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +14)------------------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=4 +15)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_availqty@2, ps_supplycost@3, s_nationkey@5] +16)----------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4 +17)------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost], constraints=[PrimaryKey([0, 1])], file_type=csv, has_header=false +18)----------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 +19)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false 20)--ProjectionExec: expr=[CAST(CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)] 21)----AggregateExec: mode=Final, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] 22)------CoalescePartitionsExec 23)--------AggregateExec: mode=Partial, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] -24)----------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(s_nationkey@2, n_nationkey@0)], projection=[ps_availqty@0, ps_supplycost@1] -25)------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 -26)--------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@0, s_suppkey@0)], projection=[ps_availqty@1, ps_supplycost@2, s_nationkey@4] -27)----------------RepartitionExec: partitioning=Hash([ps_suppkey@0], 4), input_partitions=4 -28)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_suppkey, ps_availqty, ps_supplycost], constraints=[PrimaryKey([0, 1])], file_type=csv, has_header=false -29)----------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 -30)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -31)------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 -32)--------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0] -33)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -34)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +24)----------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(n_nationkey@0, s_nationkey@2)], projection=[ps_availqty@0, ps_supplycost@1] +25)------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 +26)--------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0] +27)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +28)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +29)------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 +30)--------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@0, s_suppkey@0)], projection=[ps_availqty@1, ps_supplycost@2, s_nationkey@4] +31)----------------RepartitionExec: partitioning=Hash([ps_suppkey@0], 4), input_partitions=4 +32)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_suppkey, ps_availqty, ps_supplycost], constraints=[PrimaryKey([0, 1])], file_type=csv, has_header=false +33)----------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 +34)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q18.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q18.slt.part index 3602aa1f4a8ed..6bd7f4f6f1634 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q18.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q18.slt.part @@ -70,18 +70,18 @@ physical_plan 01)SortPreservingMergeExec: [o_totalprice@4 DESC, o_orderdate@3 ASC NULLS LAST] 02)--SortExec: expr=[o_totalprice@4 DESC, o_orderdate@3 ASC NULLS LAST], preserve_partitioning=[true] 03)----AggregateExec: mode=SinglePartitioned, gby=[c_name@1 as c_name, c_custkey@0 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@4 as o_orderdate, o_totalprice@3 as o_totalprice], aggr=[sum(lineitem.l_quantity)] -04)------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(o_orderkey@2, l_orderkey@0)] -05)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@2, l_orderkey@0)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@3, o_orderdate@4, l_quantity@6] -06)----------RepartitionExec: partitioning=Hash([o_orderkey@2], 4), input_partitions=4 -07)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@4, o_orderdate@5] -08)--------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 -09)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:0..606529], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:606529..1213058], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1213058..1819587], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1819587..2426114]]}, projection=[c_custkey, c_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -10)--------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4 -11)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -12)----------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 -13)------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_quantity], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false -14)--------FilterExec: sum(lineitem.l_quantity)@1 > 300.00, projection=[l_orderkey@0] -15)----------AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] -16)------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 -17)--------------AggregateExec: mode=Partial, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] -18)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_quantity], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false +04)------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(l_orderkey@0, o_orderkey@2)] +05)--------FilterExec: sum(lineitem.l_quantity)@1 > 300.00, projection=[l_orderkey@0] +06)----------AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] +07)------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 +08)--------------AggregateExec: mode=Partial, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] +09)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_quantity], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false +10)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@2, l_orderkey@0)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@3, o_orderdate@4, l_quantity@6] +11)----------RepartitionExec: partitioning=Hash([o_orderkey@2], 4), input_partitions=4 +12)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@4, o_orderdate@5] +13)--------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 +14)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:0..606529], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:606529..1213058], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1213058..1819587], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1819587..2426114]]}, projection=[c_custkey, c_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +15)--------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4 +16)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +17)----------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 +18)------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_quantity], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part index 7ef36a72eca26..f0147f90ab10c 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part @@ -68,10 +68,10 @@ physical_plan 02)--AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice * 1 - lineitem.l_discount) as sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * 1 - lineitem.l_discount) as sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] -05)--------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(l_partkey@0, p_partkey@0)], filter=p_brand@1 = Brand#12 AND p_container@3 IN (SET) ([SM CASE, SM BOX, SM PACK, SM PKG]) AND l_quantity@0 >= 1.00 AND l_quantity@0 <= 11.00 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN (SET) ([MED BAG, MED BOX, MED PKG, MED PACK]) AND l_quantity@0 >= 10.00 AND l_quantity@0 <= 20.00 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN (SET) ([LG CASE, LG BOX, LG PACK, LG PKG]) AND l_quantity@0 >= 20.00 AND l_quantity@0 <= 30.00 AND p_size@2 <= 15, projection=[l_extendedprice@2, l_discount@3] -06)----------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4 -07)------------FilterExec: (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND l_shipinstruct@4 = DELIVER IN PERSON AND (l_quantity@1 >= 1.00 AND l_quantity@1 <= 11.00 OR l_quantity@1 >= 10.00 AND l_quantity@1 <= 20.00 OR l_quantity@1 >= 20.00 AND l_quantity@1 <= 30.00), projection=[l_partkey@0, l_quantity@1, l_extendedprice@2, l_discount@3] -08)--------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false -09)----------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 -10)------------FilterExec: p_size@2 >= 1 AND (p_brand@1 = Brand#12 AND p_container@3 IN (SET) ([SM CASE, SM BOX, SM PACK, SM PKG]) AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN (SET) ([MED BAG, MED BOX, MED PKG, MED PACK]) AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN (SET) ([LG CASE, LG BOX, LG PACK, LG PKG]) AND p_size@2 <= 15) -11)--------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:0..597773], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:597773..1195546], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1195546..1793319], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1793319..2391090]]}, projection=[p_partkey, p_brand, p_size, p_container], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +05)--------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(p_partkey@0, l_partkey@0)], filter=p_brand@1 = Brand#12 AND p_container@3 IN (SET) ([SM CASE, SM BOX, SM PACK, SM PKG]) AND l_quantity@0 >= 1.00 AND l_quantity@0 <= 11.00 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN (SET) ([MED BAG, MED BOX, MED PKG, MED PACK]) AND l_quantity@0 >= 10.00 AND l_quantity@0 <= 20.00 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN (SET) ([LG CASE, LG BOX, LG PACK, LG PKG]) AND l_quantity@0 >= 20.00 AND l_quantity@0 <= 30.00 AND p_size@2 <= 15, projection=[l_extendedprice@2, l_discount@3] +06)----------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 +07)------------FilterExec: p_size@2 >= 1 AND (p_brand@1 = Brand#12 AND p_container@3 IN (SET) ([SM CASE, SM BOX, SM PACK, SM PKG]) AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN (SET) ([MED BAG, MED BOX, MED PKG, MED PACK]) AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN (SET) ([LG CASE, LG BOX, LG PACK, LG PKG]) AND p_size@2 <= 15) +08)--------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:0..597773], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:597773..1195546], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1195546..1793319], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1793319..2391090]]}, projection=[p_partkey, p_brand, p_size, p_container], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +09)----------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4 +10)------------FilterExec: (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND l_shipinstruct@4 = DELIVER IN PERSON AND (l_quantity@1 >= 1.00 AND l_quantity@1 <= 11.00 OR l_quantity@1 >= 10.00 AND l_quantity@1 <= 20.00 OR l_quantity@1 >= 20.00 AND l_quantity@1 <= 30.00), projection=[l_partkey@0, l_quantity@1, l_extendedprice@2, l_discount@3] +11)--------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q2.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q2.slt.part index b6fa1c4806bf4..bc87de458b344 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q2.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q2.slt.part @@ -101,45 +101,45 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [s_acctbal@0 DESC, n_name@2 ASC NULLS LAST, s_name@1 ASC NULLS LAST, p_partkey@3 ASC NULLS LAST], fetch=10 02)--SortExec: TopK(fetch=10), expr=[s_acctbal@0 DESC, n_name@2 ASC NULLS LAST, s_name@1 ASC NULLS LAST, p_partkey@3 ASC NULLS LAST], preserve_partitioning=[true] -03)----HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(p_partkey@0, ps_partkey@1), (ps_supplycost@7, min(partsupp.ps_supplycost)@0)], projection=[s_acctbal@5, s_name@2, n_name@8, p_partkey@0, p_mfgr@1, s_address@3, s_phone@4, s_comment@6] -04)------RepartitionExec: partitioning=Hash([p_partkey@0, ps_supplycost@7], 4), input_partitions=4 -05)--------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(n_regionkey@9, r_regionkey@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@4, s_acctbal@5, s_comment@6, ps_supplycost@7, n_name@8] -06)----------RepartitionExec: partitioning=Hash([n_regionkey@9], 4), input_partitions=4 -07)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@4, n_nationkey@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@5, s_acctbal@6, s_comment@7, ps_supplycost@8, n_name@10, n_regionkey@11] -08)--------------RepartitionExec: partitioning=Hash([s_nationkey@4], 4), input_partitions=4 -09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@2, s_suppkey@0)], projection=[p_partkey@0, p_mfgr@1, s_name@5, s_address@6, s_nationkey@7, s_phone@8, s_acctbal@9, s_comment@10, ps_supplycost@3] -10)------------------RepartitionExec: partitioning=Hash([ps_suppkey@2], 4), input_partitions=4 -11)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_partkey@0, p_mfgr@1, ps_suppkey@3, ps_supplycost@4] -12)----------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 -13)------------------------FilterExec: p_size@3 = 15 AND p_type@2 LIKE %BRASS, projection=[p_partkey@0, p_mfgr@1] -14)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:0..597773], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:597773..1195546], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1195546..1793319], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1793319..2391090]]}, projection=[p_partkey, p_mfgr, p_type, p_size], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -15)----------------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4 -16)------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], constraints=[PrimaryKey([0, 1])], file_type=csv, has_header=false -17)------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 -18)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -19)--------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1 -20)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name, n_regionkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -21)----------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4 -22)------------FilterExec: r_name@1 = EUROPE, projection=[r_regionkey@0] -23)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -24)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -25)------RepartitionExec: partitioning=Hash([ps_partkey@1, min(partsupp.ps_supplycost)@0], 4), input_partitions=4 -26)--------ProjectionExec: expr=[min(partsupp.ps_supplycost)@1 as min(partsupp.ps_supplycost), ps_partkey@0 as ps_partkey] -27)----------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] -28)------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4 -29)--------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] -30)----------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(n_regionkey@2, r_regionkey@0)], projection=[ps_partkey@0, ps_supplycost@1] -31)------------------RepartitionExec: partitioning=Hash([n_regionkey@2], 4), input_partitions=4 -32)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[ps_partkey@0, ps_supplycost@1, n_regionkey@4] -33)----------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 -34)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_supplycost@2, s_nationkey@4] -35)--------------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4 -36)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], constraints=[PrimaryKey([0, 1])], file_type=csv, has_header=false -37)--------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 -38)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -39)----------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1 -40)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_regionkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -41)------------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4 -42)--------------------FilterExec: r_name@1 = EUROPE, projection=[r_regionkey@0] -43)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -44)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +03)----HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(ps_partkey@1, p_partkey@0), (min(partsupp.ps_supplycost)@0, ps_supplycost@7)], projection=[s_acctbal@5, s_name@2, n_name@8, p_partkey@0, p_mfgr@1, s_address@3, s_phone@4, s_comment@6] +04)------RepartitionExec: partitioning=Hash([ps_partkey@1, min(partsupp.ps_supplycost)@0], 4), input_partitions=4 +05)--------ProjectionExec: expr=[min(partsupp.ps_supplycost)@1 as min(partsupp.ps_supplycost), ps_partkey@0 as ps_partkey] +06)----------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] +07)------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4 +08)--------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] +09)----------------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(r_regionkey@0, n_regionkey@2)], projection=[ps_partkey@0, ps_supplycost@1] +10)------------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4 +11)--------------------FilterExec: r_name@1 = EUROPE, projection=[r_regionkey@0] +12)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +14)------------------RepartitionExec: partitioning=Hash([n_regionkey@2], 4), input_partitions=4 +15)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[ps_partkey@0, ps_supplycost@1, n_regionkey@4] +16)----------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 +17)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_supplycost@2, s_nationkey@4] +18)--------------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4 +19)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], constraints=[PrimaryKey([0, 1])], file_type=csv, has_header=false +20)--------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 +21)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +22)----------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1 +23)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_regionkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +24)------RepartitionExec: partitioning=Hash([p_partkey@0, ps_supplycost@7], 4), input_partitions=4 +25)--------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(r_regionkey@0, n_regionkey@9)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@4, s_acctbal@5, s_comment@6, ps_supplycost@7, n_name@8] +26)----------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4 +27)------------FilterExec: r_name@1 = EUROPE, projection=[r_regionkey@0] +28)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +29)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +30)----------RepartitionExec: partitioning=Hash([n_regionkey@9], 4), input_partitions=4 +31)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@4, n_nationkey@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@5, s_acctbal@6, s_comment@7, ps_supplycost@8, n_name@10, n_regionkey@11] +32)--------------RepartitionExec: partitioning=Hash([s_nationkey@4], 4), input_partitions=4 +33)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@2, s_suppkey@0)], projection=[p_partkey@0, p_mfgr@1, s_name@5, s_address@6, s_nationkey@7, s_phone@8, s_acctbal@9, s_comment@10, ps_supplycost@3] +34)------------------RepartitionExec: partitioning=Hash([ps_suppkey@2], 4), input_partitions=4 +35)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_partkey@0, p_mfgr@1, ps_suppkey@3, ps_supplycost@4] +36)----------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 +37)------------------------FilterExec: p_size@3 = 15 AND p_type@2 LIKE %BRASS, projection=[p_partkey@0, p_mfgr@1] +38)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:0..597773], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:597773..1195546], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1195546..1793319], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1793319..2391090]]}, projection=[p_partkey, p_mfgr, p_type, p_size], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +39)----------------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4 +40)------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], constraints=[PrimaryKey([0, 1])], file_type=csv, has_header=false +41)------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 +42)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +43)--------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1 +44)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name, n_regionkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q20.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q20.slt.part index e038a7482d24f..b1020d45903f7 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q20.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q20.slt.part @@ -83,27 +83,27 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [s_name@0 ASC NULLS LAST] 02)--SortExec: expr=[s_name@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(s_suppkey@0, ps_suppkey@0)], projection=[s_name@1, s_address@2] -04)------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4 -05)--------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(s_nationkey@3, n_nationkey@0)], projection=[s_suppkey@0, s_name@1, s_address@2] -06)----------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=1 -07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_address, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -08)----------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 -09)------------FilterExec: n_name@1 = CANADA, projection=[n_nationkey@0] -10)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -11)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -12)------RepartitionExec: partitioning=Hash([ps_suppkey@0], 4), input_partitions=4 -13)--------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(ps_partkey@0, l_partkey@1), (ps_suppkey@1, l_suppkey@2)], filter=CAST(ps_availqty@0 AS Float64) > Float64(0.5) * sum(lineitem.l_quantity)@1, projection=[ps_suppkey@1] -14)----------RepartitionExec: partitioning=Hash([ps_partkey@0, ps_suppkey@1], 4), input_partitions=4 -15)------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(ps_partkey@0, p_partkey@0)] -16)--------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4 -17)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_availqty], constraints=[PrimaryKey([0, 1])], file_type=csv, has_header=false -18)--------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 -19)----------------FilterExec: p_name@1 LIKE forest%, projection=[p_partkey@0] -20)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:0..597773], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:597773..1195546], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1195546..1793319], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1793319..2391090]]}, projection=[p_partkey, p_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -21)----------ProjectionExec: expr=[0.5 * CAST(sum(lineitem.l_quantity)@2 AS Float64) as Float64(0.5) * sum(lineitem.l_quantity), l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey] -22)------------AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)] -23)--------------RepartitionExec: partitioning=Hash([l_partkey@0, l_suppkey@1], 4), input_partitions=4 -24)----------------AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)] -25)------------------FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01, projection=[l_partkey@0, l_suppkey@1, l_quantity@2] -26)--------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false +03)----HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(ps_suppkey@0, s_suppkey@0)], projection=[s_name@1, s_address@2] +04)------RepartitionExec: partitioning=Hash([ps_suppkey@0], 4), input_partitions=4 +05)--------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(l_partkey@1, ps_partkey@0), (l_suppkey@2, ps_suppkey@1)], filter=CAST(ps_availqty@0 AS Float64) > Float64(0.5) * sum(lineitem.l_quantity)@1, projection=[ps_suppkey@1] +06)----------ProjectionExec: expr=[0.5 * CAST(sum(lineitem.l_quantity)@2 AS Float64) as Float64(0.5) * sum(lineitem.l_quantity), l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey] +07)------------AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)] +08)--------------RepartitionExec: partitioning=Hash([l_partkey@0, l_suppkey@1], 4), input_partitions=4 +09)----------------AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)] +10)------------------FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01, projection=[l_partkey@0, l_suppkey@1, l_quantity@2] +11)--------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false +12)----------RepartitionExec: partitioning=Hash([ps_partkey@0, ps_suppkey@1], 4), input_partitions=4 +13)------------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(p_partkey@0, ps_partkey@0)] +14)--------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 +15)----------------FilterExec: p_name@1 LIKE forest%, projection=[p_partkey@0] +16)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:0..597773], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:597773..1195546], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1195546..1793319], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1793319..2391090]]}, projection=[p_partkey, p_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +17)--------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4 +18)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_availqty], constraints=[PrimaryKey([0, 1])], file_type=csv, has_header=false +19)------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4 +20)--------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(n_nationkey@0, s_nationkey@3)], projection=[s_suppkey@0, s_name@1, s_address@2] +21)----------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 +22)------------FilterExec: n_name@1 = CANADA, projection=[n_nationkey@0] +23)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +24)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +25)----------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=1 +26)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_address, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part index 812f5d2cba56b..09e812e5b8bb2 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part @@ -95,28 +95,28 @@ physical_plan 04)------AggregateExec: mode=FinalPartitioned, gby=[s_name@0 as s_name], aggr=[count(Int64(1))] 05)--------RepartitionExec: partitioning=Hash([s_name@0], 4), input_partitions=4 06)----------AggregateExec: mode=Partial, gby=[s_name@0 as s_name], aggr=[count(Int64(1))] -07)------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0, projection=[s_name@0] -08)--------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0 -09)----------------RepartitionExec: partitioning=Hash([l_orderkey@1], 4), input_partitions=4 -10)------------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(s_nationkey@1, n_nationkey@0)], projection=[s_name@0, l_orderkey@2, l_suppkey@3] -11)--------------------RepartitionExec: partitioning=Hash([s_nationkey@1], 4), input_partitions=4 -12)----------------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(l_orderkey@2, o_orderkey@0)] -13)------------------------RepartitionExec: partitioning=Hash([l_orderkey@2], 4), input_partitions=4 -14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4] -15)----------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 -16)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -17)----------------------------RepartitionExec: partitioning=Hash([l_suppkey@1], 4), input_partitions=4 -18)------------------------------FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] -19)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false -20)------------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4 -21)--------------------------FilterExec: o_orderstatus@1 = F, projection=[o_orderkey@0] -22)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderstatus], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -23)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 -24)----------------------FilterExec: n_name@1 = SAUDI ARABIA, projection=[n_nationkey@0] -25)------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -26)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -27)----------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 -28)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false -29)--------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 -30)----------------FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] -31)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false +07)------------HashJoinExec: mode=Partitioned, join_type=RightAnti, on=[(l_orderkey@0, l_orderkey@1)], filter=l_suppkey@1 != l_suppkey@0, projection=[s_name@0] +08)--------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 +09)----------------FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] +10)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false +11)--------------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(l_orderkey@0, l_orderkey@1)], filter=l_suppkey@1 != l_suppkey@0 +12)----------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 +13)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false +14)----------------RepartitionExec: partitioning=Hash([l_orderkey@1], 4), input_partitions=4 +15)------------------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(n_nationkey@0, s_nationkey@1)], projection=[s_name@0, l_orderkey@2, l_suppkey@3] +16)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 +17)----------------------FilterExec: n_name@1 = SAUDI ARABIA, projection=[n_nationkey@0] +18)------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +19)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +20)--------------------RepartitionExec: partitioning=Hash([s_nationkey@1], 4), input_partitions=4 +21)----------------------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(o_orderkey@0, l_orderkey@2)] +22)------------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4 +23)--------------------------FilterExec: o_orderstatus@1 = F, projection=[o_orderkey@0] +24)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderstatus], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +25)------------------------RepartitionExec: partitioning=Hash([l_orderkey@2], 4), input_partitions=4 +26)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4] +27)----------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 +28)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +29)----------------------------RepartitionExec: partitioning=Hash([l_suppkey@1], 4), input_partitions=4 +30)------------------------------FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] +31)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part index 40fa8939c2970..0e9a4af12d7e2 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part @@ -80,12 +80,12 @@ physical_plan 06)----------RepartitionExec: partitioning=Hash([cntrycode@0], 4), input_partitions=4 07)------------AggregateExec: mode=Partial, gby=[cntrycode@0 as cntrycode], aggr=[count(Int64(1)), sum(custsale.c_acctbal)] 08)--------------ProjectionExec: expr=[substr(c_phone@0, 1, 2) as cntrycode, c_acctbal@1 as c_acctbal] -09)----------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(c_custkey@0, o_custkey@0)], projection=[c_phone@1, c_acctbal@2] -10)------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 -11)--------------------FilterExec: substr(c_phone@1, 1, 2) IN (SET) ([13, 31, 23, 29, 30, 18, 17]) AND CAST(c_acctbal@2 AS Decimal128(19, 6)) > scalar_subquery() -12)----------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:0..606529], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:606529..1213058], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1213058..1819587], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1819587..2426114]]}, projection=[c_custkey, c_phone, c_acctbal], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -13)------------------RepartitionExec: partitioning=Hash([o_custkey@0], 4), input_partitions=4 -14)--------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_custkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +09)----------------HashJoinExec: mode=Partitioned, join_type=RightAnti, on=[(o_custkey@0, c_custkey@0)], projection=[c_phone@1, c_acctbal@2] +10)------------------RepartitionExec: partitioning=Hash([o_custkey@0], 4), input_partitions=4 +11)--------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_custkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +12)------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 +13)--------------------FilterExec: substr(c_phone@1, 1, 2) IN (SET) ([13, 31, 23, 29, 30, 18, 17]) AND CAST(c_acctbal@2 AS Decimal128(19, 6)) > scalar_subquery() +14)----------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:0..606529], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:606529..1213058], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1213058..1819587], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1819587..2426114]]}, projection=[c_custkey, c_phone, c_acctbal], constraints=[PrimaryKey([0])], file_type=csv, has_header=false 15)--AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)] 16)----CoalescePartitionsExec 17)------AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)] diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part index 1bc1b1fefbdad..f53d7821167b6 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part @@ -59,10 +59,10 @@ physical_plan 04)------AggregateExec: mode=FinalPartitioned, gby=[o_orderpriority@0 as o_orderpriority], aggr=[count(Int64(1))] 05)--------RepartitionExec: partitioning=Hash([o_orderpriority@0], 4), input_partitions=4 06)----------AggregateExec: mode=Partial, gby=[o_orderpriority@0 as o_orderpriority], aggr=[count(Int64(1))] -07)------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderpriority@1] -08)--------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4 -09)----------------FilterExec: o_orderdate@1 >= 1993-07-01 AND o_orderdate@1 < 1993-10-01, projection=[o_orderkey@0, o_orderpriority@2] -10)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderdate, o_orderpriority], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -11)--------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 -12)----------------FilterExec: l_receiptdate@2 > l_commitdate@1, projection=[l_orderkey@0] -13)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_commitdate, l_receiptdate], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false +07)------------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(l_orderkey@0, o_orderkey@0)], projection=[o_orderpriority@1] +08)--------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 +09)----------------FilterExec: l_receiptdate@2 > l_commitdate@1, projection=[l_orderkey@0] +10)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_commitdate, l_receiptdate], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false +11)--------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4 +12)----------------FilterExec: o_orderdate@1 >= 1993-07-01 AND o_orderdate@1 < 1993-10-01, projection=[o_orderkey@0, o_orderpriority@2] +13)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderdate, o_orderpriority], constraints=[PrimaryKey([0])], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q5.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q5.slt.part index 036c0e3b8c137..da313964f950d 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q5.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q5.slt.part @@ -73,27 +73,27 @@ physical_plan 04)------AggregateExec: mode=FinalPartitioned, gby=[n_name@0 as n_name], aggr=[sum(lineitem.l_extendedprice * 1 - lineitem.l_discount) as sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] 05)--------RepartitionExec: partitioning=Hash([n_name@0], 4), input_partitions=4 06)----------AggregateExec: mode=Partial, gby=[n_name@2 as n_name], aggr=[sum(lineitem.l_extendedprice * 1 - lineitem.l_discount) as sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] -07)------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(n_regionkey@3, r_regionkey@0)], projection=[l_extendedprice@0, l_discount@1, n_name@2] -08)--------------RepartitionExec: partitioning=Hash([n_regionkey@3], 4), input_partitions=4 -09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, n_name@4, n_regionkey@5] -10)------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 -11)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@1, s_suppkey@0), (c_nationkey@0, s_nationkey@1)], projection=[l_extendedprice@2, l_discount@3, s_nationkey@5] -12)----------------------RepartitionExec: partitioning=Hash([l_suppkey@1, c_nationkey@0], 4), input_partitions=4 -13)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@1, l_orderkey@0)], projection=[c_nationkey@0, l_suppkey@3, l_extendedprice@4, l_discount@5] -14)--------------------------RepartitionExec: partitioning=Hash([o_orderkey@1], 4), input_partitions=4 -15)----------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_nationkey@1, o_orderkey@2] -16)------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 -17)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:0..606529], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:606529..1213058], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1213058..1819587], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1819587..2426114]]}, projection=[c_custkey, c_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -18)------------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4 -19)--------------------------------FilterExec: o_orderdate@2 >= 1994-01-01 AND o_orderdate@2 < 1995-01-01, projection=[o_orderkey@0, o_custkey@1] -20)----------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -21)--------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 -22)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false -23)----------------------RepartitionExec: partitioning=Hash([s_suppkey@0, s_nationkey@1], 4), input_partitions=1 -24)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -25)------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1 -26)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name, n_regionkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -27)--------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4 -28)----------------FilterExec: r_name@1 = ASIA, projection=[r_regionkey@0] -29)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -30)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +07)------------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(r_regionkey@0, n_regionkey@3)], projection=[l_extendedprice@0, l_discount@1, n_name@2] +08)--------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4 +09)----------------FilterExec: r_name@1 = ASIA, projection=[r_regionkey@0] +10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +12)--------------RepartitionExec: partitioning=Hash([n_regionkey@3], 4), input_partitions=4 +13)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, n_name@4, n_regionkey@5] +14)------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 +15)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@1, s_suppkey@0), (c_nationkey@0, s_nationkey@1)], projection=[l_extendedprice@2, l_discount@3, s_nationkey@5] +16)----------------------RepartitionExec: partitioning=Hash([l_suppkey@1, c_nationkey@0], 4), input_partitions=4 +17)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@1, l_orderkey@0)], projection=[c_nationkey@0, l_suppkey@3, l_extendedprice@4, l_discount@5] +18)--------------------------RepartitionExec: partitioning=Hash([o_orderkey@1], 4), input_partitions=4 +19)----------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_nationkey@1, o_orderkey@2] +20)------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 +21)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:0..606529], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:606529..1213058], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1213058..1819587], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1819587..2426114]]}, projection=[c_custkey, c_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +22)------------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4 +23)--------------------------------FilterExec: o_orderdate@2 >= 1994-01-01 AND o_orderdate@2 < 1995-01-01, projection=[o_orderkey@0, o_custkey@1] +24)----------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +25)--------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 +26)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false +27)----------------------RepartitionExec: partitioning=Hash([s_suppkey@0, s_nationkey@1], 4), input_partitions=1 +28)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +29)------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1 +30)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name, n_regionkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part index 902413e9efb28..e80b0b05d4977 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part @@ -97,36 +97,36 @@ physical_plan 05)--------RepartitionExec: partitioning=Hash([o_year@0], 4), input_partitions=4 06)----------AggregateExec: mode=Partial, gby=[o_year@0 as o_year], aggr=[sum(CASE WHEN all_nations.nation = BRAZIL THEN all_nations.volume ELSE 0.0000 END) as sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)] 07)------------ProjectionExec: expr=[date_part(YEAR, o_orderdate@0) as o_year, l_extendedprice@1 * (1 - l_discount@2) as volume, n_name@3 as nation] -08)--------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(n_regionkey@3, r_regionkey@0)], projection=[o_orderdate@2, l_extendedprice@0, l_discount@1, n_name@4] -09)----------------RepartitionExec: partitioning=Hash([n_regionkey@3], 4), input_partitions=4 -10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, o_orderdate@3, n_regionkey@4, n_name@6] -11)--------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 -12)----------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@4, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, s_nationkey@2, o_orderdate@3, n_regionkey@6] -13)------------------------RepartitionExec: partitioning=Hash([c_nationkey@4], 4), input_partitions=4 -14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_custkey@3, c_custkey@0)], projection=[l_extendedprice@0, l_discount@1, s_nationkey@2, o_orderdate@4, c_nationkey@6] -15)----------------------------RepartitionExec: partitioning=Hash([o_custkey@3], 4), input_partitions=4 -16)------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_extendedprice@1, l_discount@2, s_nationkey@3, o_custkey@5, o_orderdate@6] -17)--------------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 -18)----------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@1, s_suppkey@0)], projection=[l_orderkey@0, l_extendedprice@2, l_discount@3, s_nationkey@5] -19)------------------------------------RepartitionExec: partitioning=Hash([l_suppkey@1], 4), input_partitions=4 -20)--------------------------------------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@0, l_suppkey@2, l_extendedprice@3, l_discount@4] -21)----------------------------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 -22)------------------------------------------FilterExec: p_type@1 = ECONOMY ANODIZED STEEL, projection=[p_partkey@0] -23)--------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:0..597773], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:597773..1195546], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1195546..1793319], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1793319..2391090]]}, projection=[p_partkey, p_type], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -24)----------------------------------------RepartitionExec: partitioning=Hash([l_partkey@1], 4), input_partitions=4 -25)------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false -26)------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 -27)--------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -28)--------------------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4 -29)----------------------------------FilterExec: o_orderdate@2 >= 1995-01-01 AND o_orderdate@2 <= 1996-12-31 -30)------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -31)----------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 -32)------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:0..606529], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:606529..1213058], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1213058..1819587], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1819587..2426114]]}, projection=[c_custkey, c_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -33)------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1 -34)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_regionkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -35)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1 -36)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false -37)----------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4 -38)------------------FilterExec: r_name@1 = AMERICA, projection=[r_regionkey@0] -39)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -40)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +08)--------------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(r_regionkey@0, n_regionkey@3)], projection=[o_orderdate@2, l_extendedprice@0, l_discount@1, n_name@4] +09)----------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4 +10)------------------FilterExec: r_name@1 = AMERICA, projection=[r_regionkey@0] +11)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +12)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +13)----------------RepartitionExec: partitioning=Hash([n_regionkey@3], 4), input_partitions=4 +14)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, o_orderdate@3, n_regionkey@4, n_name@6] +15)--------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 +16)----------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@4, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, s_nationkey@2, o_orderdate@3, n_regionkey@6] +17)------------------------RepartitionExec: partitioning=Hash([c_nationkey@4], 4), input_partitions=4 +18)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_custkey@3, c_custkey@0)], projection=[l_extendedprice@0, l_discount@1, s_nationkey@2, o_orderdate@4, c_nationkey@6] +19)----------------------------RepartitionExec: partitioning=Hash([o_custkey@3], 4), input_partitions=4 +20)------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_extendedprice@1, l_discount@2, s_nationkey@3, o_custkey@5, o_orderdate@6] +21)--------------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 +22)----------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@1, s_suppkey@0)], projection=[l_orderkey@0, l_extendedprice@2, l_discount@3, s_nationkey@5] +23)------------------------------------RepartitionExec: partitioning=Hash([l_suppkey@1], 4), input_partitions=4 +24)--------------------------------------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@0, l_suppkey@2, l_extendedprice@3, l_discount@4] +25)----------------------------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 +26)------------------------------------------FilterExec: p_type@1 = ECONOMY ANODIZED STEEL, projection=[p_partkey@0] +27)--------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:0..597773], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:597773..1195546], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1195546..1793319], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl:1793319..2391090]]}, projection=[p_partkey, p_type], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +28)----------------------------------------RepartitionExec: partitioning=Hash([l_partkey@1], 4), input_partitions=4 +29)------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount], constraints=[PrimaryKey([0, 3])], file_type=csv, has_header=false +30)------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1 +31)--------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +32)--------------------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4 +33)----------------------------------FilterExec: o_orderdate@2 >= 1995-01-01 AND o_orderdate@2 <= 1996-12-31 +34)------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +35)----------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 +36)------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:0..606529], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:606529..1213058], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1213058..1819587], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl:1819587..2426114]]}, projection=[c_custkey, c_nationkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +37)------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1 +38)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_regionkey], constraints=[PrimaryKey([0])], file_type=csv, has_header=false +39)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1 +40)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], constraints=[PrimaryKey([0])], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index a48ede604968b..4965225cb7a27 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -387,16 +387,14 @@ physical_plan 10)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 11)------------DataSourceExec: partitions=1, partition_sizes=[1] 12)--ProjectionExec: expr=[CAST(id@0 AS Int32) as id, name@1 as name] -13)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(CAST(t2.id AS Int32)@2, id@0), (name@1, name@1)], projection=[id@0, name@1], NullsEqual: true -14)------CoalescePartitionsExec -15)--------ProjectionExec: expr=[id@0 as id, name@1 as name, CAST(id@0 AS Int32) as CAST(t2.id AS Int32)] -16)----------AggregateExec: mode=FinalPartitioned, gby=[id@0 as id, name@1 as name], aggr=[] -17)------------RepartitionExec: partitioning=Hash([id@0, name@1], 4), input_partitions=4 -18)--------------AggregateExec: mode=Partial, gby=[id@0 as id, name@1 as name], aggr=[] -19)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -20)------------------DataSourceExec: partitions=1, partition_sizes=[1] -21)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -22)--------DataSourceExec: partitions=1, partition_sizes=[1] +13)----HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(id@0, CAST(t2.id AS Int32)@2), (name@1, name@1)], projection=[id@0, name@1], NullsEqual: true +14)------DataSourceExec: partitions=1, partition_sizes=[1] +15)------ProjectionExec: expr=[id@0 as id, name@1 as name, CAST(id@0 AS Int32) as CAST(t2.id AS Int32)] +16)--------AggregateExec: mode=FinalPartitioned, gby=[id@0 as id, name@1 as name], aggr=[] +17)----------RepartitionExec: partitioning=Hash([id@0, name@1], 4), input_partitions=4 +18)------------AggregateExec: mode=Partial, gby=[id@0 as id, name@1 as name], aggr=[] +19)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +20)----------------DataSourceExec: partitions=1, partition_sizes=[1] query IT rowsort @@ -441,25 +439,21 @@ logical_plan 08)------TableScan: t2 projection=[name] 09)----TableScan: t1 projection=[name] physical_plan -01)UnionExec -02)--HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0, name@0)], NullsEqual: true -03)----CoalescePartitionsExec -04)------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[] -05)--------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4 -06)----------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[] -07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -08)--------------DataSourceExec: partitions=1, partition_sizes=[1] -09)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -10)------DataSourceExec: partitions=1, partition_sizes=[1] -11)--HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0, name@0)], NullsEqual: true -12)----CoalescePartitionsExec -13)------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[] -14)--------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4 -15)----------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[] -16)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -17)--------------DataSourceExec: partitions=1, partition_sizes=[1] -18)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -19)------DataSourceExec: partitions=1, partition_sizes=[1] +01)InterleaveExec +02)--HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(name@0, name@0)], NullsEqual: true +03)----DataSourceExec: partitions=1, partition_sizes=[1] +04)----AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[] +05)------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4 +06)--------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[] +07)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +08)------------DataSourceExec: partitions=1, partition_sizes=[1] +09)--HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(name@0, name@0)], NullsEqual: true +10)----DataSourceExec: partitions=1, partition_sizes=[1] +11)----AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[] +12)------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4 +13)--------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[] +14)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +15)------------DataSourceExec: partitions=1, partition_sizes=[1] # union_upcast_types query TT diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index abf1c39510e97..93c90cdc02a70 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -173,6 +173,7 @@ The following configuration settings are available: | datafusion.optimizer.enable_piecewise_merge_join | false | When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently experimental. Physical planner will opt for PiecewiseMergeJoin when there is only one range filter. | | datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | | datafusion.optimizer.hash_join_single_partition_threshold_rows | 131072 | The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition | +| datafusion.optimizer.semi_join_swap_bias | 2 | The bias when choosing between `RightSemi`/`RightAnti`/`RightMark` and `LeftSemi`/`LeftAnti`/`LeftMark` for semi, anti, and mark hash joins. For these joins, one input's rows form the output ("preserved side"), while the other input determines which rows should be kept ("filter side"). A `RightSemi` hash join builds the hash table on the filter side and streams the preserved side; a `LeftSemi` hash join does the inverse. `RightSemi`, `RightAnti`, or `RightMark` is used unless statistics show that the filter side is more than this factor larger than the preserved side, comparing estimated total byte sizes when both sides report them and row counts otherwise. When statistics are missing on either side, the configured preference is used: a bias value greater than or equal to 1 uses `RightSemi`/`RightAnti`/`RightMark`, while a value below 1 uses `LeftSemi`/`LeftAnti`/`LeftMark`. | | datafusion.optimizer.hash_join_inlist_pushdown_max_size | 131072 | Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides larger than this will use hash table lookups instead. Set to 0 to always use hash table lookups. InList pushdown can be more efficient for small build sides because it can result in better statistics pruning as well as use any bloom filters present on the scan side. InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion. On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory. This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` \* `target_partitions` memory. The default is 128kB per partition. This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases but avoids excessive memory usage or overhead for larger joins. | | datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values | 150 | Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides with more rows than this will use hash table lookups instead. Set to 0 to always use hash table lookups. This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent very large IN lists that might not provide much benefit over hash table lookups. This uses the deduplicated row count once the build side has been evaluated. The default is 150 values per partition. This is inspired by Trino's `max-filter-keys-per-column` setting. See: | | datafusion.optimizer.default_filter_selectivity | 20 | The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). | From f6a4b547dcf1aae373f6d29290fbba8df6fb58da Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 15 Jun 2026 11:39:48 -0400 Subject: [PATCH 2/4] Run prettier --- docs/source/user-guide/configs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 93c90cdc02a70..2478547f88df1 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -173,7 +173,7 @@ The following configuration settings are available: | datafusion.optimizer.enable_piecewise_merge_join | false | When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently experimental. Physical planner will opt for PiecewiseMergeJoin when there is only one range filter. | | datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | | datafusion.optimizer.hash_join_single_partition_threshold_rows | 131072 | The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition | -| datafusion.optimizer.semi_join_swap_bias | 2 | The bias when choosing between `RightSemi`/`RightAnti`/`RightMark` and `LeftSemi`/`LeftAnti`/`LeftMark` for semi, anti, and mark hash joins. For these joins, one input's rows form the output ("preserved side"), while the other input determines which rows should be kept ("filter side"). A `RightSemi` hash join builds the hash table on the filter side and streams the preserved side; a `LeftSemi` hash join does the inverse. `RightSemi`, `RightAnti`, or `RightMark` is used unless statistics show that the filter side is more than this factor larger than the preserved side, comparing estimated total byte sizes when both sides report them and row counts otherwise. When statistics are missing on either side, the configured preference is used: a bias value greater than or equal to 1 uses `RightSemi`/`RightAnti`/`RightMark`, while a value below 1 uses `LeftSemi`/`LeftAnti`/`LeftMark`. | +| datafusion.optimizer.semi_join_swap_bias | 2 | The bias when choosing between `RightSemi`/`RightAnti`/`RightMark` and `LeftSemi`/`LeftAnti`/`LeftMark` for semi, anti, and mark hash joins. For these joins, one input's rows form the output ("preserved side"), while the other input determines which rows should be kept ("filter side"). A `RightSemi` hash join builds the hash table on the filter side and streams the preserved side; a `LeftSemi` hash join does the inverse. `RightSemi`, `RightAnti`, or `RightMark` is used unless statistics show that the filter side is more than this factor larger than the preserved side, comparing estimated total byte sizes when both sides report them and row counts otherwise. When statistics are missing on either side, the configured preference is used: a bias value greater than or equal to 1 uses `RightSemi`/`RightAnti`/`RightMark`, while a value below 1 uses `LeftSemi`/`LeftAnti`/`LeftMark`. | | datafusion.optimizer.hash_join_inlist_pushdown_max_size | 131072 | Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides larger than this will use hash table lookups instead. Set to 0 to always use hash table lookups. InList pushdown can be more efficient for small build sides because it can result in better statistics pruning as well as use any bloom filters present on the scan side. InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion. On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory. This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` \* `target_partitions` memory. The default is 128kB per partition. This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases but avoids excessive memory usage or overhead for larger joins. | | datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values | 150 | Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides with more rows than this will use hash table lookups instead. Set to 0 to always use hash table lookups. This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent very large IN lists that might not provide much benefit over hash table lookups. This uses the deduplicated row count once the build side has been evaluated. The default is 150 values per partition. This is inspired by Trino's `max-filter-keys-per-column` setting. See: | | datafusion.optimizer.default_filter_selectivity | 20 | The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). | From 213349f4533bba7fdcc223bb19ec9cca0d71e0f0 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 15 Jun 2026 21:44:18 -0400 Subject: [PATCH 3/4] . --- .../physical-plan/src/joins/hash_join/exec.rs | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index 7cddae276f5fa..c635a1598bc49 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -844,6 +844,13 @@ impl HashJoinExec { return false; } + // Dynamic filters are ordinary predicates over the probe-side join + // keys. They cannot preserve `NULL = NULL` join semantics: a probe-side + // NULL would be pruned even though it can match a build-side NULL. + if self.null_equality == NullEquality::NullEqualsNull { + return false; + } + // `preserve_file_partitions` can report Hash partitioning for Hive-style // file groups, but those partitions are not actually hash-distributed. // Partitioned dynamic filters rely on hash routing, so disable them in @@ -6410,6 +6417,35 @@ mod tests { Ok(()) } + #[test] + fn test_dynamic_filter_pushdown_rejects_null_equal_join() -> Result<()> { + let (_, _, on) = build_schema_and_on()?; + let left = build_table(("a1", &vec![1]), ("b1", &vec![1]), ("c1", &vec![1])); + let right = build_table(("a2", &vec![1]), ("b1", &vec![1]), ("c2", &vec![1])); + + let mut session_config = SessionConfig::default(); + session_config + .options_mut() + .optimizer + .enable_join_dynamic_filter_pushdown = true; + + let join = HashJoinExec::try_new( + left, + right, + on, + None, + &JoinType::RightSemi, + None, + PartitionMode::CollectLeft, + NullEquality::NullEqualsNull, + false, + )?; + + assert!(!join.allow_join_dynamic_filter_pushdown(session_config.options())); + + Ok(()) + } + #[test] fn test_with_dynamic_filter_rejects_invalid_columns() -> Result<()> { let (_, _, on) = build_schema_and_on()?; From 6e04518be36ccb3da989a4efff773f869740c60d Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Tue, 16 Jun 2026 12:31:16 -0400 Subject: [PATCH 4/4] Revert whitespace fix --- .../physical-plan/src/joins/sort_merge_join/tests.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/tests.rs b/datafusion/physical-plan/src/joins/sort_merge_join/tests.rs index 3bf921e33a55f..0347299dd0094 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join/tests.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join/tests.rs @@ -2474,12 +2474,8 @@ async fn overallocation_multi_batch_spill() -> Result<()> { assert_eq!( runtime.memory_pool.reserved(), 0, - concat!( - "memory should be fully released after {join_type:?} completes ", - "(batch_size={batch_size}): infallible grow during restore must be balanced" - ), - join_type = join_type, - batch_size = batch_size, + "memory should be fully released after {join_type:?} completes + (batch_size={batch_size}): infallible grow during restore must be balanced" ); // Run the test with no spill configuration as let task_ctx_no_spill =