
Commit 050d6e9

huleilei authored
fix(iceberg): Correct test setup to ensure delete files are created (#5864)
## Changes Made The integration test TestIcebergCountPushdown.test_count_pushdown_with_delete_files was failing for the test_overlapping_deletes table because it incorrectly enabled count pushdown. The root cause was that the initial Spark write created multiple small data files. Subsequent DELETE operations were optimized by Iceberg to mark entire data files as removed instead of generating position/equality delete files. As a result, Daft's _has_delete_files() check did not find any delete files and incorrectly allowed the count pushdown optimization. This PR fixes the test by adding coalesce(1) to the Spark DataFrame before writing the initial data for the test_overlapping_deletes table. This ensures the data is written to a single Parquet file, forcing subsequent DELETE operations to generate actual delete files. This aligns the test's behavior with its intent, correctly disabling count pushdown when delete files are present. ## Related Issues #5863 5863 <!-- Link to related GitHub issues, e.g., "Closes #123" --> --------- Co-authored-by: root <root@bytedance> Co-authored-by: huleilei <huleilei@bytedance>
1 parent b200c59 commit 050d6e9
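
For context, below is a minimal sketch of the kind of check behind Daft's _has_delete_files(), written against PyIceberg's scan-planning API purely for illustration. The catalog name and table identifier are placeholders, and Daft's actual implementation may differ:

```python
# Hypothetical sketch, not Daft's real code: detect whether any delete
# files apply to a table's current snapshot using PyIceberg.
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")  # placeholder catalog configuration
table = catalog.load_table("default.test_overlapping_deletes")

# Each FileScanTask pairs one data file with the position/equality delete
# files that apply to it; any non-empty list means stored row counts alone
# cannot answer COUNT(*).
has_deletes = any(task.delete_files for task in table.scan().plan_files())
print(has_deletes)
```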

File tree

2 files changed (+22, -22 lines)

daft/io/iceberg/iceberg_scan.py

Lines changed: 1 addition & 1 deletion

@@ -321,7 +321,7 @@ def can_absorb_select(self) -> bool:
         return True
 
     def supports_count_pushdown(self) -> bool:
-        return True and not self._has_delete_files()
+        return not self._has_delete_files()
 
     def supported_count_modes(self) -> list[CountMode]:
         return [CountMode.All]
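
This gate exists because count pushdown answers COUNT(*) from metadata: Iceberg manifests record a row count per data file, so no data needs to be scanned, but deleted rows are still included in those stored counts. A hedged sketch of that reasoning, again using PyIceberg's API for illustration rather than Daft's internals:

```python
# Hedged sketch: answer COUNT(*) from Iceberg metadata alone, which is
# only safe when no delete files exist. record_count is the per-data-file
# row count stored in the manifest.
from pyiceberg.table import Table


def metadata_count(table: Table) -> int:
    tasks = list(table.scan().plan_files())
    if any(task.delete_files for task in tasks):
        # Deleted rows are still included in record_count, so a
        # metadata-only answer would overcount; a real scan is required.
        raise ValueError("delete files present; cannot count from metadata")
    return sum(task.file.record_count for task in tasks)
```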

tests/integration/iceberg/docker-compose/provision.py

Lines changed: 21 additions & 21 deletions

@@ -452,27 +452,27 @@
     """
 )
 
-spark.sql(
-    """
-    INSERT INTO default.test_overlapping_deletes
-    VALUES
-    (1, 'Alice', 100.0, 'A'),
-    (2, 'Bob', 200.0, 'B'),
-    (3, 'Charlie', 300.0, 'A'),
-    (4, 'David', 400.0, 'B'),
-    (5, 'Eve', 500.0, 'A'),
-    (6, 'Frank', 600.0, 'B'),
-    (7, 'Grace', 700.0, 'A'),
-    (8, 'Henry', 800.0, 'B'),
-    (9, 'Ivy', 900.0, 'A'),
-    (10, 'Jack', 1000.0, 'B'),
-    (11, 'Kate', 1100.0, 'A'),
-    (12, 'Leo', 1200.0, 'B'),
-    (13, 'Mary', 1300.0, 'A'),
-    (14, 'Nick', 1400.0, 'B'),
-    (15, 'Olivia', 1500.0, 'A');
-    """
-)
+data = [
+    (1, "Alice", 100.0, "A"),
+    (2, "Bob", 200.0, "B"),
+    (3, "Charlie", 300.0, "A"),
+    (4, "David", 400.0, "B"),
+    (5, "Eve", 500.0, "A"),
+    (6, "Frank", 600.0, "B"),
+    (7, "Grace", 700.0, "A"),
+    (8, "Henry", 800.0, "B"),
+    (9, "Ivy", 900.0, "A"),
+    (10, "Jack", 1000.0, "B"),
+    (11, "Kate", 1100.0, "A"),
+    (12, "Leo", 1200.0, "B"),
+    (13, "Mary", 1300.0, "A"),
+    (14, "Nick", 1400.0, "B"),
+    (15, "Olivia", 1500.0, "A"),
+]
+columns = ["id", "name", "value", "category"]
+df = spark.createDataFrame(data, columns)
+df = df.coalesce(1)
+df.writeTo("default.test_overlapping_deletes").append()
 
 spark.sql(
     """
