Commit 0d4ba08
feat(optimizer): Add Lance count() pushdown optimization (#4969)
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent 950922d commit 0d4ba08

File tree

22 files changed: +627 −15 lines changed
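In user-facing terms, this optimization lets a whole-table count over a Lance dataset be answered from Lance metadata instead of a full scan. A minimal sketch of the query shape it targets, assuming daft is installed with Lance support and a local dataset at ./events.lance (path illustrative):

import daft

# With this commit, a plain row count over a Lance dataset should plan as a
# single one-row scan task backed by Lance's count_rows() metadata call,
# rather than reading every fragment.
df = daft.read_lance("./events.lance")
print(df.count_rows())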

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

daft/daft/__init__.pyi

Lines changed: 10 additions & 0 deletions
@@ -868,18 +868,28 @@ class PyPushdowns:
     filters: PyExpr | None
     partition_filters: PyExpr | None
     limit: int | None
+    aggregation: PyExpr | None

     def __init__(
         self,
         columns: list[str] | None = None,
         filters: PyExpr | None = None,
         partition_filters: PyExpr | None = None,
         limit: int | None = None,
+        aggregation: PyExpr | None = None,
     ) -> None: ...
     def filter_required_column_names(self) -> list[str]:
         """List of field names that are required by the filter predicate."""
         ...

+    def aggregation_required_column_names(self) -> list[str]:
+        """List of field names that are required by the aggregation predicate."""
+        ...
+
+    def aggregation_count_mode(self) -> CountMode:
+        """Count mode of the aggregation predicate."""
+        ...
+
 PyArrowParquetType = tuple[pa.Field, dict[str, str], pa.Array, int]

 def read_parquet(
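A short sketch of how this new stub surface fits together. Building the aggregation through daft.col(...).count(...) and reaching for the internal ._expr handle are assumptions for illustration, not documented API:

import daft
from daft.daft import CountMode, PyPushdowns

# Hypothetical: attach a count aggregation to pushdowns and inspect it via the
# new accessors. Expression.count accepting a CountMode and the internal
# ._expr handle are assumptions here.
agg = daft.col("id").count(CountMode.All)
pushdowns = PyPushdowns(aggregation=agg._expr)
print(pushdowns.aggregation_count_mode())             # CountMode.All
print(pushdowns.aggregation_required_column_names())  # ["id"]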

daft/io/lance/lance_scan.py

Lines changed: 78 additions & 4 deletions
@@ -1,10 +1,12 @@
 # ruff: noqa: I002
 # isort: dont-add-import: from __future__ import annotations

+import logging
 from collections.abc import Iterator
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union

-from daft.daft import PyExpr, PyPartitionField, PyPushdowns, PyRecordBatch, ScanTask
+from daft.daft import CountMode, PyExpr, PyPartitionField, PyPushdowns, PyRecordBatch, ScanTask
+from daft.dependencies import pa
 from daft.io.scan import ScanOperator
 from daft.logical.schema import Schema
 from daft.recordbatch import RecordBatch
@@ -13,15 +15,16 @@
 if TYPE_CHECKING:
     import lance
-    import pyarrow
+
+logger = logging.getLogger(__name__)


 # TODO support fts and fast_search
 def _lancedb_table_factory_function(
     ds: "lance.LanceDataset",
     fragment_ids: Optional[list[int]] = None,
     required_columns: Optional[list[str]] = None,
-    filter: Optional["pyarrow.compute.Expression"] = None,
+    filter: Optional["pa.compute.Expression"] = None,
     limit: Optional[int] = None,
 ) -> Iterator[PyRecordBatch]:
     fragments = [ds.get_fragment(id) for id in (fragment_ids or [])]
@@ -31,6 +34,30 @@ def _lancedb_table_factory_function(
     return (RecordBatch.from_arrow_record_batches([rb], rb.schema)._recordbatch for rb in scanner.to_batches())


+def _lancedb_count_result_function(
+    ds: "lance.LanceDataset",
+    required_column: str,
+    filters: Optional[list[Any]] = None,
+) -> Iterator[PyRecordBatch]:
+    """Use LanceDB's API to count rows and return a record batch with the count result."""
+    count = 0
+    if filters is None:
+        logger.debug("Using metadata for counting all rows (no filters)")
+        count = ds.count_rows()
+    else:
+        # Filters rule out the metadata shortcut: scan and count the matching rows.
+        logger.debug("Counting rows with filters applied")
+        scanner = ds.scanner(filter=filters)
+        for batch in scanner.to_batches():
+            count += batch.num_rows
+
+    arrow_schema = pa.schema([pa.field(required_column, pa.uint64())])
+    arrow_array = pa.array([count], type=pa.uint64())
+    arrow_batch = pa.RecordBatch.from_arrays([arrow_array], [required_column])
+    result_batch = RecordBatch.from_arrow_record_batches([arrow_batch], arrow_schema)._recordbatch
+    return (result_batch for _ in [1])
+
+
 class LanceDBScanOperator(ScanOperator, SupportsPushdownFilters):
     def __init__(self, ds: "lance.LanceDataset"):
         self._ds = ds
@@ -57,6 +84,14 @@ def can_absorb_limit(self) -> bool:
     def can_absorb_select(self) -> bool:
         return False

+    def supports_count_pushdown(self) -> bool:
+        """Returns whether this scan operator supports count pushdown."""
+        return True
+
+    def supported_count_modes(self) -> list[CountMode]:
+        """Returns the count modes supported by this scan operator."""
+        return [CountMode.All]
+
     def multiline_display(self) -> list[str]:
         return [
             self.display_name(),
@@ -95,6 +130,46 @@ def to_scan_tasks(self, pushdowns: PyPushdowns) -> Iterator[ScanTask]:
                 else pushdowns.columns + filter_required_column_names
             )
         )
+
+        # Check if there is a count aggregation pushdown
+        if (
+            pushdowns.aggregation is not None
+            and pushdowns.aggregation_count_mode() is not None
+            and pushdowns.aggregation_required_column_names()
+        ):
+            count_mode = pushdowns.aggregation_count_mode()
+            fields = pushdowns.aggregation_required_column_names()
+
+            if count_mode not in self.supported_count_modes():
+                logger.warning(
+                    "Count mode %s is not supported for pushdown, falling back to original logic",
+                    count_mode,
+                )
+                yield from self._create_regular_scan_tasks(pushdowns, required_columns)
+                return  # otherwise the count task below would also be emitted
+
+            # TODO: If there are pushed filters, convert them to Arrow expressions
+            filters = None
+
+            new_schema = Schema.from_pyarrow_schema(pa.schema([pa.field(fields[0], pa.uint64())]))
+            yield ScanTask.python_factory_func_scan_task(
+                module=_lancedb_count_result_function.__module__,
+                func_name=_lancedb_count_result_function.__name__,
+                func_args=(self._ds, fields[0], filters),
+                schema=new_schema._schema,
+                num_rows=1,
+                size_bytes=None,
+                pushdowns=pushdowns,
+                stats=None,
+            )
+        else:
+            # Regular scan without count pushdown
+            yield from self._create_regular_scan_tasks(pushdowns, required_columns)
+
+    def _create_regular_scan_tasks(
+        self, pushdowns: PyPushdowns, required_columns: Optional[list[str]]
+    ) -> Iterator[ScanTask]:
+        """Create regular scan tasks without count pushdown."""
         # TODO: figure out how to translate Pushdowns into LanceDB filters
         filters = None
         fragments = self._ds.get_fragments()
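The count result that _lancedb_count_result_function builds above is just a single-row Arrow batch; the construction can be reproduced standalone with pyarrow (the count value here is made up):

import pyarrow as pa

# One uint64 column, named after the counted field, holding the row count.
count, required_column = 12345, "id"
schema = pa.schema([pa.field(required_column, pa.uint64())])
batch = pa.RecordBatch.from_arrays([pa.array([count], type=pa.uint64())], schema=schema)
print(batch.num_rows, batch.schema)  # 1; id: uint64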

daft/io/scan.py

Lines changed: 4 additions & 0 deletions
@@ -79,3 +79,7 @@ def to_scan_tasks(self, pushdowns: PyPushdowns) -> Iterator[ScanTask]:
     def as_pushdown_filter(self) -> SupportsPushdownFilters | None:
         """Returns this scan operator as a SupportsPushdownFilters if it supports pushdown filters."""
         raise NotImplementedError()
+
+    def supports_count_pushdown(self) -> bool:
+        """Returns true if this scan can accept count pushdowns."""
+        return False
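Because the base class defaults to False, operators opt in explicitly, as LanceDBScanOperator does above. A minimal sketch of such an override; MyScanOperator is illustrative only:

from daft.daft import CountMode
from daft.io.scan import ScanOperator

class MyScanOperator(ScanOperator):
    # Illustrative only: schema(), display_name(), to_scan_tasks(), and the
    # remaining abstract ScanOperator methods are omitted for brevity.
    def supports_count_pushdown(self) -> bool:
        return True

    def supported_count_modes(self) -> "list[CountMode]":
        return [CountMode.All]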

src/common/scan-info/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ common-display = {path = "../display", default-features = false}
 common-error = {path = "../error", default-features = false}
 common-file-formats = {path = "../file-formats", default-features = false}
 daft-algebra = {path = "../../daft-algebra", default-features = false}
+daft-core = {path = "../../daft-core", default-features = false}
 daft-dsl = {path = "../../daft-dsl", default-features = false}
 daft-schema = {path = "../../daft-schema", default-features = false}
 fnv = "1.0.7"

src/common/scan-info/src/pushdowns.rs

Lines changed: 33 additions & 1 deletion
@@ -28,11 +28,16 @@ pub struct Pushdowns {
     /// The `filters` field is kept for backward compatibility;
     /// it represents all current filters.
     pub pushed_filters: Option<Vec<ExprRef>>,
+
+    /// Optional aggregation pushdown.
+    /// This is used to indicate that the scan operator can perform an aggregation.
+    /// This is useful for scans that can perform aggregations like `count`.
+    pub aggregation: Option<ExprRef>,
 }

 impl Default for Pushdowns {
     fn default() -> Self {
-        Self::new(None, None, None, None, None)
+        Self::new(None, None, None, None, None, None)
     }
 }

@@ -44,6 +49,7 @@ impl Pushdowns {
         columns: Option<Arc<Vec<String>>>,
         limit: Option<usize>,
         sharder: Option<Sharder>,
+        aggregation: Option<ExprRef>,
     ) -> Self {
         Self {
             filters,
@@ -52,6 +58,7 @@ impl Pushdowns {
             limit,
             sharder,
             pushed_filters: None,
+            aggregation,
         }
     }

@@ -72,6 +79,7 @@ impl Pushdowns {
             limit,
             sharder: self.sharder.clone(),
             pushed_filters: self.pushed_filters.clone(),
+            aggregation: self.aggregation.clone(),
         }
     }

@@ -84,6 +92,7 @@ impl Pushdowns {
             limit: self.limit,
             sharder: self.sharder.clone(),
             pushed_filters: self.pushed_filters.clone(),
+            aggregation: self.aggregation.clone(),
         }
     }

@@ -96,6 +105,7 @@ impl Pushdowns {
             limit: self.limit,
             sharder: self.sharder.clone(),
             pushed_filters: self.pushed_filters.clone(),
+            aggregation: self.aggregation.clone(),
         }
     }

@@ -108,6 +118,7 @@ impl Pushdowns {
             limit: self.limit,
             sharder: self.sharder.clone(),
             pushed_filters: self.pushed_filters.clone(),
+            aggregation: self.aggregation.clone(),
         }
     }

@@ -120,6 +131,7 @@ impl Pushdowns {
             limit: self.limit,
             sharder,
             pushed_filters: self.pushed_filters.clone(),
+            aggregation: self.aggregation.clone(),
         }
     }

@@ -132,6 +144,20 @@ impl Pushdowns {
             limit: self.limit,
             sharder: self.sharder.clone(),
             pushed_filters,
+            aggregation: self.aggregation.clone(),
+        }
+    }
+
+    #[must_use]
+    pub fn with_aggregation(&self, aggregation: Option<ExprRef>) -> Self {
+        Self {
+            filters: self.filters.clone(),
+            partition_filters: self.partition_filters.clone(),
+            columns: self.columns.clone(),
+            limit: self.limit,
+            sharder: self.sharder.clone(),
+            pushed_filters: self.pushed_filters.clone(),
+            aggregation,
         }
     }

@@ -153,6 +179,9 @@ impl Pushdowns {
         if let Some(sharder) = &self.sharder {
             res.push(format!("Sharder = {sharder}"));
         }
+        if let Some(aggregation) = &self.aggregation {
+            res.push(format!("Aggregation pushdown = {aggregation}"));
+        }
         res
     }

@@ -187,6 +216,9 @@ impl DisplayAs for Pushdowns {
         if let Some(sharder) = &self.sharder {
             sub_items.push(format!("sharder: {sharder}"));
         }
+        if let Some(aggregation) = &self.aggregation {
+            sub_items.push(format!("aggregation: {aggregation}"));
+        }
         s.push_str(&sub_items.join(", "));
         s.push('}');
         s

src/common/scan-info/src/python.rs

Lines changed: 31 additions & 1 deletion
@@ -3,7 +3,8 @@ use pyo3::prelude::*;
 pub mod pylib {
     use std::sync::Arc;

-    use daft_dsl::python::PyExpr;
+    use daft_core::count_mode::CountMode;
+    use daft_dsl::{python::PyExpr, AggExpr, Expr};
     use daft_schema::python::field::PyField;
     use pyo3::{exceptions::PyAttributeError, prelude::*, pyclass};
     use serde::{Deserialize, Serialize};
@@ -164,19 +165,22 @@
         partition_filters = None,
         columns = None,
         limit = None,
+        aggregation = None,
     ))]
     pub fn new(
         filters: Option<PyExpr>,
         partition_filters: Option<PyExpr>,
         columns: Option<Vec<String>>,
         limit: Option<usize>,
+        aggregation: Option<PyExpr>,
     ) -> Self {
         let pushdowns = Pushdowns::new(
             filters.map(|f| f.expr),
             partition_filters.map(|f| f.expr),
             columns.map(Arc::new),
             limit,
             None,
+            aggregation.map(|f| f.expr),
         );
         Self(Arc::new(pushdowns))
     }
@@ -212,12 +216,38 @@
         self.0.columns.as_deref().cloned()
     }

+    #[getter]
+    #[must_use]
+    pub fn aggregation(&self) -> Option<PyExpr> {
+        self.0
+            .aggregation
+            .as_ref()
+            .map(|e| PyExpr { expr: e.clone() })
+    }
+
     pub fn filter_required_column_names(&self) -> Option<Vec<String>> {
         self.0
             .filters
             .as_ref()
             .map(daft_dsl::optimization::get_required_columns)
     }
+
+    pub fn aggregation_required_column_names(&self) -> Option<Vec<String>> {
+        self.0
+            .aggregation
+            .as_ref()
+            .map(daft_dsl::optimization::get_required_columns)
+    }
+
+    pub fn aggregation_count_mode(&self) -> Option<CountMode> {
+        match self.0.aggregation.as_ref() {
+            Some(expr) => match expr.as_ref() {
+                Expr::Agg(AggExpr::Count(_, count_mode)) => Some(*count_mode),
+                _ => None,
+            },
+            None => None,
+        }
+    }
     }
 }

src/common/scan-info/src/scan_operator.rs

Lines changed: 8 additions & 0 deletions
@@ -33,6 +33,14 @@ pub trait ScanOperator: Send + Sync + Debug {
     fn can_absorb_shard(&self) -> bool;
     fn multiline_display(&self) -> Vec<String>;

+    fn supports_count_pushdown(&self) -> bool {
+        false
+    }
+
+    fn supported_count_modes(&self) -> Vec<daft_core::count_mode::CountMode> {
+        Vec::new()
+    }
+
     /// If cfg provided, `to_scan_tasks` should apply the appropriate transformations
     /// (merging, splitting) to the outputted scan tasks
     fn to_scan_tasks(&self, pushdowns: Pushdowns) -> DaftResult<Vec<ScanTaskLikeRef>>;

src/common/scan-info/src/test/mod.rs

Lines changed: 5 additions & 0 deletions
@@ -29,6 +29,7 @@ pub struct DummyScanOperator {
     pub schema: SchemaRef,
     pub num_scan_tasks: u32,
     pub num_rows_per_task: Option<usize>,
+    pub supports_count_pushdown_flag: bool,
 }

 #[typetag::serde]
@@ -145,6 +146,10 @@ impl ScanOperator for DummyScanOperator {
         vec!["DummyScanOperator".to_string()]
     }

+    fn supports_count_pushdown(&self) -> bool {
+        self.supports_count_pushdown_flag
+    }
+
     fn to_scan_tasks(&self, pushdowns: Pushdowns) -> DaftResult<Vec<ScanTaskLikeRef>> {
         Ok((0..self.num_scan_tasks)
             .map(|i| {
