Commit 0b511b1

Read metadata to get count_rows

1 parent: 282dbc5

File tree

3 files changed: +160 / -153 lines

    daft/io/iceberg/iceberg_scan.py
    tests/integration/iceberg/docker-compose/provision.py
    tests/integration/iceberg/test_iceberg_reads.py

3 files changed

+160
-153
lines changed

daft/io/iceberg/iceberg_scan.py

Lines changed: 13 additions & 78 deletions
@@ -39,25 +39,15 @@


 def _iceberg_count_result_function(total_count: int, field_name: str) -> Iterator[PyRecordBatch]:
-    """Construct Iceberg count query result.
-
-    This function creates a single-row result containing the count value,
-    which is used by the count pushdown optimization.
-    """
+    """Construct Iceberg count query result."""
     try:
-        # Create Arrow schema and array for the count result
         arrow_schema = pa.schema([pa.field(field_name, pa.uint64())])
         arrow_array = pa.array([total_count], type=pa.uint64())
         arrow_batch = pa.RecordBatch.from_arrays([arrow_array], [field_name])

-        # Convert to Daft RecordBatch
-        result_batch = RecordBatch.from_arrow_record_batches([arrow_batch], arrow_schema)._recordbatch
-
         logger.debug("Generated Iceberg count result: %s=%d", field_name, total_count)

-        # Yield the result batch (generator pattern)
-        yield result_batch
-
+        yield RecordBatch.from_arrow_record_batches([arrow_batch], arrow_schema)._recordbatch
     except Exception as e:
         logger.error("Failed to construct Iceberg count result: %s", e)
         raise

@@ -266,26 +256,25 @@ def _create_regular_scan_tasks(self, pushdowns: PyPushdowns) -> Iterator[ScanTask]:
         return iter(scan_tasks)

     def _create_count_scan_task(self, pushdowns: PyPushdowns, field_name: str) -> Iterator[ScanTask]:
-        """Create count pushdown scan task using Iceberg metadata.
-
-        This method leverages Iceberg's manifest files to calculate the total row count
-        without reading the actual data files, providing significant performance improvements.
-        """
+        """Create count pushdown scan task using Iceberg metadata."""
         try:
-            # Calculate total count from Iceberg metadata
-            total_count = self._calculate_total_rows_from_metadata()
+            from pyiceberg.table.snapshots import TOTAL_RECORDS

-            # Create result schema with the count field
+            if self._snapshot_id is None:
+                snapshot = self._table.current_snapshot()
+            else:
+                snapshot = self._table.snapshot_by_id(self._snapshot_id)
+
+            total_count = int(snapshot.summary.get(TOTAL_RECORDS, 0))
             result_schema = Schema.from_pyarrow_schema(pa.schema([pa.field(field_name, pa.uint64())]))

-            # Create Python factory function scan task
             scan_task = ScanTask.python_factory_func_scan_task(
                 module=_iceberg_count_result_function.__module__,
                 func_name=_iceberg_count_result_function.__name__,
                 func_args=(total_count, field_name),
                 schema=result_schema._schema,
-                num_rows=1,  # Count result is always a single row
-                size_bytes=8,  # uint64 size
+                num_rows=1,
+                size_bytes=8,
                 pushdowns=pushdowns,
                 stats=None,
             )

@@ -294,54 +283,9 @@ def _create_count_scan_task(self, pushdowns: PyPushdowns, field_name: str) -> Iterator[ScanTask]:
             yield scan_task

         except Exception as e:
-            logger.error("Failed to create Iceberg count pushdown task: %s", e)
-            # Fallback to regular scan if count pushdown fails
-            logger.warning("Falling back to regular scan due to count pushdown failure")
+            logger.error("Failed to create Iceberg count pushdown task: %s, now falling back to regular scan", e)
             yield from self._create_regular_scan_tasks(pushdowns)

-    def _calculate_total_rows_from_metadata(self) -> int:
-        """Calculate total row count from Iceberg manifest metadata.
-
-        This method reads the manifest files to aggregate record_count information
-        from all data files without accessing the actual data.
-        """
-        try:
-            # Get scan plan from Iceberg table
-            iceberg_tasks = self._table.scan(
-                limit=None,  # No limit for count calculation
-                snapshot_id=self._snapshot_id,
-            ).plan_files()
-
-            total_rows = 0
-            total_deleted = 0
-
-            # Aggregate row counts from all data files
-            for task in iceberg_tasks:
-                data_file = task.file
-                total_rows += data_file.record_count
-
-                # Handle delete files (for Iceberg MOR - Merge-on-Read)
-                for delete_file in task.delete_files:
-                    # For now, we'll use a simple estimation for delete files
-                    # In a production implementation, this could be more sophisticated
-                    total_deleted += delete_file.record_count
-
-            # Calculate final count (ensure non-negative)
-            final_count = max(0, total_rows - total_deleted)
-
-            logger.info(
-                "Calculated Iceberg count from metadata: total_rows=%d, deleted_rows=%d, final_count=%d",
-                total_rows,
-                total_deleted,
-                final_count,
-            )
-
-            return final_count
-
-        except Exception as e:
-            logger.error("Failed to calculate total rows from Iceberg metadata: %s", e)
-            raise
-
     def can_absorb_filter(self) -> bool:
         return False

@@ -352,16 +296,7 @@ def can_absorb_select(self) -> bool:
         return True

     def supports_count_pushdown(self) -> bool:
-        """Returns whether this scan operator supports count pushdown.
-
-        Iceberg supports count pushdown by leveraging metadata stored in manifest files.
-        Each data file's record_count is available without reading the actual data.
-        """
         return True

     def supported_count_modes(self) -> list[CountMode]:
-        """Returns the count modes supported by this scan operator.
-
-        Currently only supports COUNT(*) which corresponds to CountMode.All.
-        """
        return [CountMode.All]
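
Note on the change above: the count no longer comes from summing record_count over planned data files; it is read from the snapshot summary that Iceberg writers maintain. A minimal standalone sketch of the same lookup with PyIceberg is below; the catalog and table names are placeholders and not part of this commit:

    from pyiceberg.catalog import load_catalog
    from pyiceberg.table.snapshots import TOTAL_RECORDS  # key for the "total-records" summary entry

    # Placeholder names; point these at your own catalog configuration.
    catalog = load_catalog("local")
    table = catalog.load_table("default.test_all_types")

    # Use the current snapshot, or table.snapshot_by_id(...) for a specific snapshot.
    snapshot = table.current_snapshot()

    # The summary lives in the table metadata, so no data files are opened to answer COUNT(*).
    if snapshot is None or snapshot.summary is None:
        total_rows = 0
    else:
        total_rows = int(snapshot.summary.get(TOTAL_RECORDS, 0))
    print(total_rows)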

tests/integration/iceberg/docker-compose/provision.py

Lines changed: 140 additions & 0 deletions
@@ -427,3 +427,143 @@
 """)

 spark.sql("INSERT INTO default.test_snapshotting VALUES (4, 1)")
+
+
+###
+# MOR (Merge-on-Read) complex scenario test table
+# Used to test the accuracy of the count pushdown in complex delete-file scenarios
+###
+
+spark.sql(
+    """
+    CREATE OR REPLACE TABLE default.test_overlapping_deletes (
+        id integer,
+        name string,
+        value double,
+        category string
+    )
+    USING iceberg
+    TBLPROPERTIES (
+        'write.delete.mode'='merge-on-read',
+        'write.update.mode'='merge-on-read',
+        'write.merge.mode'='merge-on-read',
+        'format-version'='2'
+    );
+    """
+)
+
+spark.sql(
+    """
+    INSERT INTO default.test_overlapping_deletes
+    VALUES
+        (1, 'Alice', 100.0, 'A'),
+        (2, 'Bob', 200.0, 'B'),
+        (3, 'Charlie', 300.0, 'A'),
+        (4, 'David', 400.0, 'B'),
+        (5, 'Eve', 500.0, 'A'),
+        (6, 'Frank', 600.0, 'B'),
+        (7, 'Grace', 700.0, 'A'),
+        (8, 'Henry', 800.0, 'B'),
+        (9, 'Ivy', 900.0, 'A'),
+        (10, 'Jack', 1000.0, 'B'),
+        (11, 'Kate', 1100.0, 'A'),
+        (12, 'Leo', 1200.0, 'B'),
+        (13, 'Mary', 1300.0, 'A'),
+        (14, 'Nick', 1400.0, 'B'),
+        (15, 'Olivia', 1500.0, 'A');
+    """
+)
+
+spark.sql(
+    """
+    DELETE FROM default.test_overlapping_deletes WHERE id <= 5
+    """
+)
+
+spark.sql(
+    """
+    DELETE FROM default.test_overlapping_deletes WHERE id <= 3
+    """
+)
+
+spark.sql(
+    """
+    DELETE FROM default.test_overlapping_deletes WHERE id >= 4 AND id <= 8
+    """
+)
+
+# Mixed delete type test table - tests the mixed handling of position deletes and equality deletes
+
+spark.sql(
+    """
+    CREATE OR REPLACE TABLE default.test_mixed_delete_types (
+        id integer,
+        name string,
+        age integer,
+        department string,
+        salary double,
+        active boolean
+    )
+    USING iceberg
+    TBLPROPERTIES (
+        'write.delete.mode'='merge-on-read',
+        'write.update.mode'='merge-on-read',
+        'write.merge.mode'='merge-on-read',
+        'format-version'='2'
+    );
+    """
+)
+
+spark.sql(
+    """
+    INSERT INTO default.test_mixed_delete_types
+    VALUES
+        (1, 'Alice', 25, 'Engineering', 75000.0, true),
+        (2, 'Bob', 30, 'Marketing', 65000.0, true),
+        (3, 'Charlie', 35, 'Engineering', 85000.0, true),
+        (4, 'David', 28, 'Sales', 60000.0, false),
+        (5, 'Eve', 32, 'Engineering', 90000.0, true),
+        (6, 'Frank', 45, 'Marketing', 70000.0, true),
+        (7, 'Grace', 29, 'Engineering', 80000.0, true),
+        (8, 'Henry', 38, 'Sales', 55000.0, false),
+        (9, 'Ivy', 26, 'Engineering', 78000.0, true),
+        (10, 'Jack', 33, 'Marketing', 68000.0, true),
+        (11, 'Kate', 31, 'Engineering', 82000.0, true),
+        (12, 'Leo', 27, 'Sales', 58000.0, true),
+        (13, 'Mary', 34, 'Engineering', 88000.0, true),
+        (14, 'Nick', 29, 'Marketing', 66000.0, false),
+        (15, 'Olivia', 36, 'Engineering', 92000.0, true),
+        (16, 'Paul', 40, 'Sales', 62000.0, true),
+        (17, 'Quinn', 28, 'Engineering', 76000.0, true),
+        (18, 'Rachel', 32, 'Marketing', 69000.0, true),
+        (19, 'Steve', 37, 'Engineering', 87000.0, true),
+        (20, 'Tina', 30, 'Sales', 61000.0, false);
+    """
+)
+
+spark.sql(
+    """
+    DELETE FROM default.test_mixed_delete_types WHERE id IN (2, 5, 8, 11, 14)
+    """
+)
+
+spark.sql(
+    """
+    DELETE FROM default.test_mixed_delete_types WHERE department = 'Sales' AND active = false
+    """
+)
+
+spark.sql(
+    """
+    DELETE FROM default.test_mixed_delete_types WHERE age < 30 AND salary < 70000
+    """
+)
+
+spark.sql(
+    """
+    INSERT INTO default.test_mixed_delete_types
+    VALUES
+        (2, 'Lily', 60, 'Sales', 2000.0, true),
+        (21, 'Lucy', 28, 'Engineering', 76000.0, true);
+    """
+)
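
As a sanity check on the two fixtures added above, the expected surviving row counts can be worked out by hand from the INSERT/DELETE statements and verified in the same Spark session. The check below is illustrative only and not part of this commit:

    # Hand-derived expectations from the statements above:
    #   test_overlapping_deletes: 15 rows inserted, the deletes cover ids 1-8   -> 7 rows left (ids 9-15)
    #   test_mixed_delete_types:  20 rows inserted, deletes remove ids 2, 4, 5, 8, 11, 12, 14, 20,
    #                             then two more rows are inserted               -> 14 rows left
    for table_name, expected in [("test_overlapping_deletes", 7), ("test_mixed_delete_types", 14)]:
        count = spark.sql(f"SELECT COUNT(*) AS cnt FROM default.{table_name}").collect()[0]["cnt"]
        print(table_name, count, "expected:", expected)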

tests/integration/iceberg/test_iceberg_reads.py

Lines changed: 7 additions & 75 deletions
@@ -45,6 +45,8 @@ def test_daft_iceberg_table_open(local_iceberg_tables, local_iceberg_catalog):
     "test_add_new_column",
     "test_new_column_with_no_data",
     "test_table_rename",
+    "test_overlapping_deletes",
+    "test_mixed_delete_types",
     # Partition evolution currently not supported, see issue: https://github.com/Eventual-Inc/Daft/issues/2249
     # "test_evolve_partitioning",
 ]

@@ -69,6 +71,8 @@ def test_daft_iceberg_table_open(local_iceberg_tables, local_iceberg_catalog):
     "test_add_new_column": ["idx"],
     "test_new_column_with_no_data": [],
     "test_table_rename": [],
+    "test_overlapping_deletes": [],
+    "test_mixed_delete_types": [],
 }


@@ -240,62 +244,12 @@ def test_daft_iceberg_table_mor_predicate_collect_correct(table_name, local_iceberg_catalog):
 class TestIcebergCountPushdown:
     """Test suite for Iceberg Count pushdown optimization."""

-    @pytest.mark.integration()
-    def test_count_pushdown_basic(self, local_iceberg_catalog, capsys):
-        """Test basic count(*) pushdown functionality."""
-        catalog_name, pyiceberg_catalog = local_iceberg_catalog
-        tab = pyiceberg_catalog.load_table("default.test_all_types")
-
-        # Test Daft count with pushdown
-        df = daft.read_table(f"{catalog_name}.default.test_all_types").count()
-        _ = capsys.readouterr()
-        df.explain(True)
-        actual = capsys.readouterr()
-        assert "daft.io.iceberg.iceberg_scan:_iceberg_count_result_function" in actual.out
-
-        daft_count = df.collect().to_pydict()["count"][0]
-
-        # Compare with PyIceberg count
-        iceberg_count = len(tab.scan().to_arrow())
-
-        assert daft_count == iceberg_count
-
-    @pytest.mark.integration()
-    def test_count_pushdown_empty_table(self, local_iceberg_catalog, capsys):
-        """Test count pushdown on empty table."""
-        catalog_name, pyiceberg_catalog = local_iceberg_catalog
-
-        # Use a table that might be empty or create logic to test empty scenario
-        try:
-            tab = pyiceberg_catalog.load_table("default.test_new_column_with_no_data")
-            df = daft.read_table(f"{catalog_name}.default.test_new_column_with_no_data").count()
-
-            _ = capsys.readouterr()
-            df.explain(True)
-            actual = capsys.readouterr()
-            assert "daft.io.iceberg.iceberg_scan:_iceberg_count_result_function" in actual.out
-
-            daft_count = df.collect().to_pydict()["count"][0]
-
-            # Compare with PyIceberg count
-            iceberg_count = len(tab.scan().to_arrow())
-
-            assert daft_count == iceberg_count
-        except Exception:
-            # If table doesn't exist or has issues, skip this test
-            pytest.skip("Empty table test requires specific table setup")
-
     @pytest.mark.integration()
     @pytest.mark.parametrize(
         "table_name",
-        [
-            "test_partitioned_by_identity",
-            "test_partitioned_by_bucket",
-            "test_partitioned_by_days",
-            "test_partitioned_by_years",
-        ],
+        WORKING_SHOW_COLLECT,
     )
-    def test_count_pushdown_partitioned_tables(self, table_name, local_iceberg_catalog, capsys):
+    def test_count_pushdown_basic(self, table_name, local_iceberg_catalog, capsys):
         """Test count pushdown on partitioned tables."""
         catalog_name, pyiceberg_catalog = local_iceberg_catalog
         tab = pyiceberg_catalog.load_table(f"default.{table_name}")

@@ -345,7 +299,7 @@ def test_count_pushdown_with_column_selection(self, local_iceberg_catalog, capsys):

         # Test count with column selection (should still use pushdown)
         df = daft.read_table(f"{catalog_name}.default.test_all_types")
-        df = df.select("id") if "id" in df.column_names else df.select(df.column_names[0]).count()
+        df = df.select("id").count() if "id" in df.column_names else df.select(df.column_names[0]).count()

         _ = capsys.readouterr()
         df.explain(True)

@@ -407,25 +361,3 @@ def test_count_pushdown_snapshot_consistency(self, local_iceberg_catalog, capsys):
         except Exception:
             # If snapshotting table doesn't exist, skip this test
             pytest.skip("Snapshot test requires test_snapshotting table")
-
-    @pytest.mark.integration()
-    @pytest.mark.parametrize("table_name", ["test_positional_mor_deletes", "test_positional_mor_double_deletes"])
-    def test_count_pushdown_with_deletes(self, table_name, local_iceberg_catalog, capsys):
-        """Test count pushdown on tables with MOR (Merge-On-Read) deletes."""
-        catalog_name, pyiceberg_catalog = local_iceberg_catalog
-        tab = pyiceberg_catalog.load_table(f"default.{table_name}")
-
-        # Test Daft count on table with deletes
-        df = daft.read_table(f"{catalog_name}.default.{table_name}").count()
-
-        _ = capsys.readouterr()
-        df.explain(True)
-        actual = capsys.readouterr()
-        assert "daft.io.iceberg.iceberg_scan:_iceberg_count_result_function" in actual.out
-
-        daft_count = df.collect().to_pydict()["count"][0]
-
-        # Compare with PyIceberg count (should account for deletes)
-        iceberg_count = len(tab.scan().to_arrow())
-
-        assert daft_count == iceberg_count
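
For a quick manual check outside of pytest, the same pushdown can be exercised interactively. The catalog prefix below is a placeholder for whatever catalog your session has attached (the tests obtain theirs from the local_iceberg_catalog fixture):

    import daft

    # Placeholder catalog name; attach or register your own Iceberg catalog first.
    df = daft.read_table("my_catalog.default.test_overlapping_deletes").count()

    # When the pushdown applies, the explained plan references the metadata-based factory function
    # daft.io.iceberg.iceberg_scan:_iceberg_count_result_function.
    df.explain(True)

    print(df.collect().to_pydict()["count"][0])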
