Commit 5c3cea5

perf: Implement count pushdown for parquet (#5038)
## Changes Made

#4969 implemented count pushdowns for Lance. We can do the same for Parquet and read only the Parquet metadata. For example, I have a bucket at `s3://desmond-test/big.parquet/` with the following setup:

- 161 parquet files
- Each parquet file has `256000` rows of strings with 1024 random characters, giving a total size of 1.2GB per file
- The footer itself is only `2247` bytes long

Before, doing a count takes ~28s:

```
In [2]: %time daft.read_parquet("s3://desmond-test/big.parquet").count().show()
╭──────────╮
│ count    │
│ ---      │
│ UInt64   │
╞══════════╡
│ 41216000 │
╰──────────╯

(Showing first 1 of 1 rows)
CPU times: user 1min 4s, sys: 1min 46s, total: 2min 50s
Wall time: 28.2 s
```

After, it takes ~4s:

```
In [2]: %time daft.read_parquet("s3://desmond-test/big.parquet").count().show()
╭──────────╮
│ count    │
│ ---      │
│ UInt64   │
╞══════════╡
│ 41216000 │
╰──────────╯

(Showing first 1 of 1 rows)
CPU times: user 301 ms, sys: 94.2 ms, total: 395 ms
Wall time: 3.89 s
```
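The count pushdown works because a Parquet file's footer already stores the total row count, so the query can be answered from a few KB of metadata per file instead of scanning roughly 190 GB of column data across the 161 files above. Below is a minimal, standalone sketch of that idea using the Apache Arrow `parquet` crate and a hypothetical local file path; it is not the code added in this PR, which goes through Daft's own reader:

```rust
use std::fs::File;

// Assumes the Apache Arrow `parquet` crate; Daft uses its own parquet reader internally.
use parquet::file::reader::{FileReader, SerializedFileReader};

/// Count rows by reading only the footer metadata; no column data pages are touched.
fn count_rows(path: &str) -> Result<i64, Box<dyn std::error::Error>> {
    let file = File::open(path)?;
    let reader = SerializedFileReader::new(file)?;
    // The total row count is stored in the file-level metadata in the footer.
    Ok(reader.metadata().file_metadata().num_rows())
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical local file; the benchmark above reads from S3.
    println!("rows: {}", count_rows("big.parquet")?);
    Ok(())
}
```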
1 parent 8f1fee8 commit 5c3cea5

File tree: 8 files changed, +146 -38 lines changed


src/daft-local-execution/src/sources/scan_task.rs

Lines changed: 44 additions & 29 deletions

```diff
@@ -13,6 +13,7 @@ use common_runtime::{combine_stream, get_compute_pool_num_threads, get_io_runtime
 use common_scan_info::{Pushdowns, ScanTaskLike};
 use daft_core::prelude::{AsArrow, Int64Array, SchemaRef, Utf8Array};
 use daft_csv::{CsvConvertOptions, CsvParseOptions, CsvReadOptions};
+use daft_dsl::{AggExpr, Expr};
 use daft_io::IOStatsRef;
 use daft_json::{JsonConvertOptions, JsonParseOptions, JsonReadOptions};
 use daft_micropartition::MicroPartition;
@@ -470,36 +471,50 @@ async fn stream_scan_task(
             chunk_size: chunk_size_from_config,
             ..
         }) => {
-            let parquet_chunk_size = chunk_size_from_config.or(chunk_size);
-            let inference_options =
-                ParquetSchemaInferenceOptions::new(Some(*coerce_int96_timestamp_unit));
-
-            let delete_rows = delete_map.as_ref().and_then(|m| m.get(url).cloned());
-            let row_groups = if let Some(ChunkSpec::Parquet(row_groups)) = source.get_chunk_spec() {
-                Some(row_groups.clone())
+            if let Some(aggregation) = &scan_task.pushdowns.aggregation
+                && let Expr::Agg(AggExpr::Count(_, _)) = aggregation.as_ref()
+            {
+                daft_parquet::read::stream_parquet_count_pushdown(
+                    url,
+                    io_client,
+                    Some(io_stats),
+                    field_id_mapping.clone(),
+                    aggregation,
+                )
+                .await?
             } else {
-                None
-            };
-            let metadata = scan_task
-                .sources
-                .first()
-                .and_then(|s| s.get_parquet_metadata().cloned());
-            daft_parquet::read::stream_parquet(
-                url,
-                file_column_names.as_deref(),
-                scan_task.pushdowns.limit,
-                row_groups,
-                scan_task.pushdowns.filters.clone(),
-                io_client,
-                Some(io_stats),
-                &inference_options,
-                field_id_mapping.clone(),
-                metadata,
-                maintain_order,
-                delete_rows,
-                parquet_chunk_size,
-            )
-            .await?
+                let parquet_chunk_size = chunk_size_from_config.or(chunk_size);
+                let inference_options =
+                    ParquetSchemaInferenceOptions::new(Some(*coerce_int96_timestamp_unit));
+
+                let delete_rows = delete_map.as_ref().and_then(|m| m.get(url).cloned());
+                let row_groups =
+                    if let Some(ChunkSpec::Parquet(row_groups)) = source.get_chunk_spec() {
+                        Some(row_groups.clone())
+                    } else {
+                        None
+                    };
+                let metadata = scan_task
+                    .sources
+                    .first()
+                    .and_then(|s| s.get_parquet_metadata().cloned());
+                daft_parquet::read::stream_parquet(
+                    url,
+                    file_column_names.as_deref(),
+                    scan_task.pushdowns.limit,
+                    row_groups,
+                    scan_task.pushdowns.filters.clone(),
+                    io_client,
+                    Some(io_stats),
+                    &inference_options,
+                    field_id_mapping.clone(),
+                    metadata,
+                    maintain_order,
+                    delete_rows,
+                    parquet_chunk_size,
+                )
+                .await?
+            }
         }
         FileFormatConfig::Csv(cfg) => {
             let schema_of_file = scan_task.schema.clone();
```

src/daft-logical-plan/src/optimization/optimizer.rs

Lines changed: 1 addition & 0 deletions

```diff
@@ -837,6 +837,7 @@ mod tests {
             )))))
             .with_columns(Some(Arc::new(vec!["a".to_string()]))),
         )
+        .aggregate(vec![unresolved_col("a").sum()], vec![])?
         .build();

         let scan_materializer_and_stats_enricher = get_scan_materializer_and_stats_enricher();
```

src/daft-logical-plan/src/optimization/rules/push_down_aggregation.rs

Lines changed: 21 additions & 2 deletions

```diff
@@ -1,6 +1,6 @@
 use std::sync::Arc;

-use common_error::DaftResult;
+use common_error::{DaftError, DaftResult};
 use common_treenode::{Transformed, TreeNode};
 use daft_core::{count_mode::CountMode, prelude::Schema};
 use daft_dsl::{AggExpr, Expr, ExprRef};
@@ -65,7 +65,16 @@ impl OptimizerRule for PushDownAggregation {
                         SourceInfo::Physical(new_external_info).into(),
                     ))
                     .into();
-                    Ok(Transformed::yes(new_source))
+                    // Scan operators may produce partial counts over multiple scan tasks (e.g., distributed parquet reads), so we still need to sum them.
+                    let new_aggregate = Aggregate::try_new(
+                        new_source,
+                        vec![Arc::new(Expr::Agg(AggExpr::Sum(count_expr(
+                            &aggregations[0],
+                        )?)))],
+                        groupby.clone(),
+                    )?
+                    .into();
+                    Ok(Transformed::yes(new_aggregate))
                 } else {
                     Ok(Transformed::no(node.clone()))
                 }
@@ -93,6 +102,15 @@ fn is_count_expr(expr: &ExprRef) -> Option<&CountMode> {
     }
 }

+fn count_expr(expr: &ExprRef) -> DaftResult<ExprRef> {
+    match expr.as_ref() {
+        Expr::Agg(AggExpr::Count(expr, _)) => Ok(expr.clone()),
+        _ => Err(DaftError::InternalError(
+            "Tried to get count expression from non-count expression".to_string(),
+        )),
+    }
+}
+
 // Check if the count mode is supported for pushdown
 // Currently only CountMode::All is fully supported
 fn is_count_mode_supported(count_mode: &CountMode) -> bool {
@@ -150,6 +168,7 @@ mod tests {
                 CountMode::All,
             ))))),
         )
+        .aggregate(vec![unresolved_col("a").sum()], vec![])?
         .build();

         assert_optimized_plan_eq(plan, expected)?;
```
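The comment added in this rule is the crux of the plan rewrite: the pushed-down count is computed per scan task, so the optimizer keeps a `Sum` aggregation on top of the source to combine the partial counts into the global total. A toy illustration of that final step (plain Rust, reusing the benchmark numbers from the commit message; not code from this PR):

```rust
// Each scan task contributes a footer-derived partial count; the remaining
// Sum aggregation in the rewritten plan combines them into the global count.
fn final_count(partial_counts: &[u64]) -> u64 {
    partial_counts.iter().sum()
}

fn main() {
    // 161 files with 256_000 rows each, as in the benchmark setup above.
    let partials = vec![256_000u64; 161];
    assert_eq!(final_count(&partials), 41_216_000);
}
```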

src/daft-micropartition/src/micropartition.rs

Lines changed: 29 additions & 1 deletion

```diff
@@ -15,7 +15,7 @@ use common_runtime::get_io_runtime;
 use common_scan_info::Pushdowns;
 use daft_core::prelude::*;
 use daft_csv::{CsvConvertOptions, CsvParseOptions, CsvReadOptions};
-use daft_dsl::ExprRef;
+use daft_dsl::{AggExpr, Expr, ExprRef};
 use daft_io::{IOClient, IOConfig, IOStatsContext, IOStatsRef};
 use daft_json::{JsonConvertOptions, JsonParseOptions, JsonReadOptions};
 use daft_parquet::{
@@ -514,6 +514,11 @@ impl MicroPartition {
                 parquet_metadata,
                 chunk_size,
                 scan_task.generated_fields.clone(),
+                scan_task
+                    .pushdowns
+                    .aggregation
+                    .as_ref()
+                    .map(|agg| agg.as_ref()),
             )
             .context(DaftCoreComputeSnafu)
         }
@@ -1106,6 +1111,7 @@ pub fn read_parquet_into_micropartition<T: AsRef<str>>(
     parquet_metadata: Option<Vec<Arc<FileMetaData>>>,
     chunk_size: Option<usize>,
     generated_fields: Option<SchemaRef>,
+    aggregation_pushdown: Option<&Expr>,
 ) -> DaftResult<MicroPartition> {
     if let Some(so) = start_offset
         && so > 0
@@ -1187,6 +1193,28 @@ pub fn read_parquet_into_micropartition<T: AsRef<str>>(
         (metadata, schemas)
     };

+    // Handle count pushdown aggregation optimization.
+    if let Some(Expr::Agg(AggExpr::Count(_, _))) = aggregation_pushdown {
+        let count: usize = metadata.iter().map(|m| m.num_rows).sum();
+        let count_field = daft_core::datatypes::Field::new(
+            aggregation_pushdown.unwrap().name(),
+            daft_core::datatypes::DataType::UInt64,
+        );
+        let count_array =
+            UInt64Array::from_iter(count_field.clone(), std::iter::once(Some(count as u64)));
+        let count_batch = daft_recordbatch::RecordBatch::new_with_size(
+            Schema::new(vec![count_field]),
+            vec![count_array.into_series()],
+            1,
+        )
+        .context(DaftCoreComputeSnafu)?;
+        return Ok(MicroPartition::new_loaded(
+            count_batch.schema.clone(),
+            Arc::new(vec![count_batch]),
+            None,
+        ));
+    }
+
     let any_stats_avail = metadata
         .iter()
         .flat_map(|m| m.row_groups.values())
```
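When the eager (non-streaming) read path sees a count pushdown, it skips reading row groups entirely and materializes a single one-row batch whose only column is named after the aggregation expression. A rough sketch of that shape using arrow-rs types rather than Daft's `RecordBatch`/`Series` (an assumption made only to keep the example self-contained):

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, UInt64Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;

/// Build a 1-row, 1-column batch holding the metadata-derived count, with the
/// column named after the aggregation expression so it matches the plan's schema.
fn count_batch(agg_name: &str, count: u64) -> Result<RecordBatch, ArrowError> {
    let schema = Arc::new(Schema::new(vec![Field::new(agg_name, DataType::UInt64, false)]));
    let column: ArrayRef = Arc::new(UInt64Array::from(vec![count]));
    RecordBatch::try_new(schema, vec![column])
}

fn main() -> Result<(), ArrowError> {
    let batch = count_batch("count", 41_216_000)?;
    assert_eq!(batch.num_rows(), 1);
    Ok(())
}
```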

src/daft-micropartition/src/python.rs

Lines changed: 2 additions & 0 deletions

```diff
@@ -758,6 +758,7 @@ impl PyMicroPartition {
                 None,
                 None,
                 None,
+                None,
             )
         })?;
         Ok(mp.into())
@@ -819,6 +820,7 @@ impl PyMicroPartition {
                 None,
                 chunk_size,
                 None,
+                None,
             )
         })?;
         Ok(mp.into())
```

src/daft-parquet/src/read.rs

Lines changed: 29 additions & 0 deletions

```diff
@@ -1021,6 +1021,35 @@ pub async fn read_parquet_metadata_bulk(
     all_metadatas.into_iter().collect::<DaftResult<Vec<_>>>()
 }

+/// Optimized for count pushdowns: we can get the count from metadata without reading all data.
+pub async fn stream_parquet_count_pushdown(
+    url: &str,
+    io_client: Arc<IOClient>,
+    io_stats: Option<IOStatsRef>,
+    field_id_mapping: Option<Arc<BTreeMap<i32, Field>>>,
+    aggregation: &ExprRef,
+) -> DaftResult<BoxStream<'static, DaftResult<RecordBatch>>> {
+    let parquet_metadata =
+        read_parquet_metadata(url, io_client, io_stats, field_id_mapping.clone()).await?;
+
+    // Currently only CountMode::All is supported for count pushdown.
+    let count = parquet_metadata.num_rows;
+    let count_field = daft_core::datatypes::Field::new(
+        aggregation.name(),
+        daft_core::datatypes::DataType::UInt64,
+    );
+    let count_array =
+        UInt64Array::from_iter(count_field.clone(), std::iter::once(Some(count as u64)));
+    let count_batch = daft_recordbatch::RecordBatch::new_with_size(
+        Schema::new(vec![count_field]),
+        vec![count_array.into_series()],
+        1,
+    )?;
+    Ok(Box::pin(futures::stream::once(
+        async move { Ok(count_batch) },
+    )))
+}
+
 pub fn read_parquet_statistics(
     uris: &Series,
     io_client: Arc<IOClient>,
```
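`stream_parquet_count_pushdown` fetches only the footer via `read_parquet_metadata` and then wraps its single precomputed batch in a one-element stream, so the count path plugs into the same streaming interface as a normal parquet read. A stripped-down sketch of that wrapping pattern, where `RecordBatch` is a stand-in struct rather than Daft's type and the `futures`/`tokio` crates are assumed:

```rust
use futures::stream::{self, BoxStream, StreamExt};

// Stand-in for Daft's RecordBatch, just to keep the sketch self-contained.
#[derive(Debug)]
struct RecordBatch {
    num_rows: u64,
}

/// Package one precomputed result as a boxed async stream.
fn count_result_stream(count: u64) -> BoxStream<'static, RecordBatch> {
    Box::pin(stream::once(async move { RecordBatch { num_rows: count } }))
}

#[tokio::main]
async fn main() {
    let mut batches = count_result_stream(41_216_000);
    while let Some(batch) = batches.next().await {
        println!("count batch: {batch:?}");
    }
}
```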

src/daft-scan/src/glob.rs

Lines changed: 1 addition & 1 deletion

```diff
@@ -353,7 +353,7 @@ impl ScanOperator for GlobScanOperator {
     }

     fn supports_count_pushdown(&self) -> bool {
-        false
+        self.file_format_config.file_format() == FileFormat::Parquet
     }

     fn multiline_display(&self) -> Vec<String> {
```

src/daft-scan/src/lib.rs

Lines changed: 19 additions & 5 deletions

```diff
@@ -646,7 +646,15 @@ impl ScanTask {
     #[must_use]
     pub fn materialized_schema(&self) -> SchemaRef {
         match (&self.generated_fields, &self.pushdowns.columns) {
-            (None, None) => self.schema.clone(),
+            (None, None) => {
+                if let Some(aggregation) = &self.pushdowns.aggregation {
+                    Arc::new(Schema::new(vec![aggregation
+                        .to_field(&self.schema)
+                        .expect("Casting to aggregation field should not fail")]))
+                } else {
+                    self.schema.clone()
+                }
+            }
             _ => {
                 let schema_with_generated_fields =
                     if let Some(generated_fields) = &self.generated_fields {
@@ -657,10 +665,16 @@ impl ScanTask {
                     };

                 let mut fields = schema_with_generated_fields.fields().to_vec();
-
-                // Filter the schema based on the pushdown column filters.
-                if let Some(columns) = &self.pushdowns.columns {
-                    fields.retain(|field| columns.contains(&field.name));
+                if let Some(aggregation) = &self.pushdowns.aggregation {
+                    // If we have a pushdown aggregation, the only field in the schema is the aggregation.
+                    fields = vec![aggregation
+                        .to_field(&schema_with_generated_fields)
+                        .expect("Casting to aggregation field should not fail")];
+                } else {
+                    // Filter the schema based on the pushdown column filters.
+                    if let Some(columns) = &self.pushdowns.columns {
+                        fields.retain(|field| columns.contains(&field.name));
+                    }
                 }

                 Arc::new(Schema::new(fields))
```
