feat: include local data bytes in the dry run report when available (#2185)

sycai · web-flow · commit ee2c40c67895 · 2025-10-22T15:41:54.000Z
* feat: include local data bytes in the dry run report when available

* fix test
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -967,7 +967,7 @@ def _compute_dry_run(
         }
 
         dry_run_stats = dry_runs.get_query_stats_with_dtypes(
-            query_job, column_dtypes, self.index.dtypes
+            query_job, column_dtypes, self.index.dtypes, self.expr.node
         )
         return dry_run_stats, query_job
 
diff --git a/bigframes/session/dry_runs.py b/bigframes/session/dry_runs.py
@@ -20,6 +20,7 @@
 import pandas
 
 from bigframes import dtypes
+from bigframes.core import bigframe_node, nodes
 
 
 def get_table_stats(table: bigquery.Table) -> pandas.Series:
@@ -86,13 +87,26 @@ def get_query_stats_with_dtypes(
     query_job: bigquery.QueryJob,
     column_dtypes: Dict[str, dtypes.Dtype],
     index_dtypes: Sequence[dtypes.Dtype],
+    expr_root: bigframe_node.BigFrameNode | None = None,
 ) -> pandas.Series:
+    """
+    Return key statistics from the query job as a pandas Series, together with the column and index dtypes.
+
+    Args:
+        expr_root (Optional):
+            The root of the expression tree that may contain local data, whose size is added to the
+            total bytes count if available.
+
+    """
     index = ["columnCount", "columnDtypes", "indexLevel", "indexDtypes"]
     values = [len(column_dtypes), column_dtypes, len(index_dtypes), index_dtypes]
 
     s = pandas.Series(values, index=index)
 
-    return pandas.concat([s, get_query_stats(query_job)])
+    result = pandas.concat([s, get_query_stats(query_job)])
+    if expr_root is not None:
+        result["totalBytesProcessed"] += get_local_bytes(expr_root)
+    return result
 
 
 def get_query_stats(
@@ -145,4 +159,24 @@ def get_query_stats(
         else None
     )
 
-    return pandas.Series(values, index=index)
+    result = pandas.Series(values, index=index)
+    if result["totalBytesProcessed"] is None:
+        result["totalBytesProcessed"] = 0
+    else:
+        result["totalBytesProcessed"] = int(result["totalBytesProcessed"])
+
+    return result
+
+
+def get_local_bytes(root: bigframe_node.BigFrameNode) -> int:
+    """Sum the buffer sizes of every ReadLocalNode reduced up from ``root``."""
+
+    def _accumulate(
+        node: bigframe_node.BigFrameNode, child_results: tuple[int, ...]
+    ) -> int:
+        own_bytes = 0  # only local-data nodes contribute bytes of their own
+        if isinstance(node, nodes.ReadLocalNode):
+            own_bytes = node.local_data_source.data.get_total_buffer_size()
+        return sum(child_results) + own_bytes
+
+    return root.reduce_up(_accumulate)
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
@@ -2173,6 +2173,22 @@ def test_read_gbq_query_dry_run(scalars_table_id, session):
     _assert_query_dry_run_stats_are_valid(result)
 
 
+def test_block_dry_run_includes_local_data(session):
+    left = bigframes.dataframe.DataFrame({"col_1": [1, 2, 3]}, session=session)
+    right = bigframes.dataframe.DataFrame({"col_2": [1, 2, 3]}, session=session)
+
+    result = left.merge(right, how="cross").to_pandas(dry_run=True)
+
+    assert isinstance(result, pd.Series)
+    _assert_query_dry_run_stats_are_valid(result)
+    assert result["totalBytesProcessed"] > 0
+
+    # The merged frame's byte count must equal the sum of its two inputs' counts.
+    left_bytes = left.to_pandas(dry_run=True)["totalBytesProcessed"]
+    right_bytes = right.to_pandas(dry_run=True)["totalBytesProcessed"]
+    assert left_bytes + right_bytes == result["totalBytesProcessed"]
+
+
 def _assert_query_dry_run_stats_are_valid(result: pd.Series):
     expected_index = pd.Index(
         [

Original file line number	Diff line number	Diff line change
`@@ -967,7 +967,7 @@ def _compute_dry_run(`
`967`	`967`	`}`
`968`	`968`
`969`	`969`	`dry_run_stats = dry_runs.get_query_stats_with_dtypes(`
`970`		`- query_job, column_dtypes, self.index.dtypes`
	`970`	`+ query_job, column_dtypes, self.index.dtypes, self.expr.node`
`971`	`971`	`)`
`972`	`972`	`return dry_run_stats, query_job`
`973`	`973`