Commit 3d8b17f

fix: support results with STRUCT and ARRAY columns containing JSON subfields in to_pandas_batches() (#2216)
* Correctly display DataFrames with JSON columns in anywidget
* Improve JSON type handling for to_gbq and to_pandas_batches
* Revert "Correctly display DataFrames with JSON columns in anywidget" (this reverts commit 8c34512)
* Remove unnecessary comment
* Refactor code
* Update test case
* Fix test case
* Update function call in bigframes/core/blocks.py; remove unused function from bigframes/dtypes.py
* Revert the code refactor in loader.py; that refactor will land in a separate PR
* Replace the manual construction of the empty DataFrame with a more robust try...except block that leverages to_pyarrow and empty_table
* Fix test case
* Use the existing arrow_to_pandas() helper, which properly handles dtype conversion
* Update test case
* Refactor test case
* Add pyarrow issue id to comments
1 parent 94c8b3c commit 3d8b17f
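As a hedged usage sketch (not part of the commit), this is the kind of query the fix targets. It assumes a configured BigQuery DataFrames session and uses bpd as the conventional alias for bigframes.pandas:

import bigframes.pandas as bpd

sql = """
    SELECT
        0 AS id,
        [JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
        STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
"""
df = bpd.read_gbq(sql, index_col="id")

# Before this fix, streaming results with JSON nested in ARRAY or STRUCT
# columns could fail while building the empty placeholder DataFrame.
for batch in df.to_pandas_batches():
    print(batch.dtypes)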

File tree: 2 files changed, +96 −6 lines


bigframes/core/blocks.py

Lines changed: 10 additions & 6 deletions

@@ -68,6 +68,7 @@
 import bigframes.operations.aggregations as agg_ops
 from bigframes.session import dry_runs, execution_spec
 from bigframes.session import executor as executors
+from bigframes.session._io import pandas as io_pandas

 # Type constraint for wherever column labels are used
 Label = typing.Hashable
@@ -711,12 +712,15 @@ def to_pandas_batches(
         # To reduce the number of edge cases to consider when working with the
         # results of this, always return at least one DataFrame. See:
         # b/428918844.
-        empty_val = pd.DataFrame(
-            {
-                col: pd.Series([], dtype=self.expr.get_column_type(col))
-                for col in itertools.chain(self.value_columns, self.index_columns)
-            }
-        )
+        try:
+            empty_arrow_table = self.expr.schema.to_pyarrow().empty_table()
+        except pa.ArrowNotImplementedError:
+            # Bug with some pyarrow versions (https://github.com/apache/arrow/issues/45262):
+            # empty_table only supports base storage types, not extension types.
+            empty_arrow_table = self.expr.schema.to_pyarrow(
+                use_storage_types=True
+            ).empty_table()
+        empty_val = io_pandas.arrow_to_pandas(empty_arrow_table, self.expr.schema)
         dfs = map(
             lambda a: a[0],
             itertools.zip_longest(
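For context outside bigframes internals, here is a minimal, hedged sketch of the same fallback pattern in plain pyarrow. Assumptions: JSONArrowType is the Arrow extension type from the db-dtypes package that BigQuery DataFrames uses for JSON columns, and _storage_type is a hypothetical helper approximating what schema.to_pyarrow(use_storage_types=True) does internally.

import pyarrow as pa
import db_dtypes  # assumed: provides JSONArrowType (storage type: string)


def _storage_type(dtype: pa.DataType) -> pa.DataType:
    """Recursively swap extension types for their storage types (hypothetical helper)."""
    if isinstance(dtype, pa.ExtensionType):
        return _storage_type(dtype.storage_type)
    if pa.types.is_list(dtype):
        return pa.list_(_storage_type(dtype.value_type))
    if pa.types.is_struct(dtype):
        return pa.struct([pa.field(f.name, _storage_type(f.type)) for f in dtype])
    return dtype


# A schema like the ones the new tests produce: JSON nested in ARRAY and STRUCT.
schema = pa.schema(
    [
        ("id", pa.int64()),
        ("json_array", pa.list_(db_dtypes.JSONArrowType())),
        ("json_struct", pa.struct([("json_field", db_dtypes.JSONArrowType())])),
    ]
)

try:
    empty_table = schema.empty_table()
except pa.ArrowNotImplementedError:
    # Affected pyarrow versions cannot build empty arrays for (nested)
    # extension types (apache/arrow#45262); retry with storage types only.
    empty_table = pa.schema(
        [(f.name, _storage_type(f.type)) for f in schema]
    ).empty_table()

print(empty_table.schema)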

tests/system/small/test_dataframe_io.py

Lines changed: 86 additions & 0 deletions

@@ -376,6 +376,92 @@ def test_to_pandas_batches_w_empty_dataframe(session):
     pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes)


+@pytest.mark.skipif(
+    bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 1.x behavior only",
+)
+def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas1(session):
+    """Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 1.x."""
+    sql = """
+        SELECT
+            0 AS id,
+            [JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
+            STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+    batches = list(df.to_pandas_batches())
+
+    assert batches[0].dtypes["json_array"] == "object"
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+
+
+@pytest.mark.skipif(
+    not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 2.x behavior only",
+)
+def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas2(session):
+    """Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 2.x."""
+    sql = """
+        SELECT
+            0 AS id,
+            [JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
+            STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+    batches = list(df.to_pandas_batches())
+
+    assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
+    assert isinstance(batches[0].dtypes["json_array"].pyarrow_dtype, pa.ListType)
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+
+
+@pytest.mark.skipif(
+    bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 1.x behavior only",
+)
+def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas1(session):
+    """Verify to_pandas_batches() works with empty nested JSON types in pandas 1.x."""
+
+    sql = """
+        SELECT
+            1 AS id,
+            [] AS json_array,
+            STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+
+    # The main point: this should not raise an error
+    batches = list(df.to_pandas_batches())
+    assert sum(len(b) for b in batches) == 1
+
+    assert batches[0].dtypes["json_array"] == "object"
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+
+
+@pytest.mark.skipif(
+    not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 2.x behavior only",
+)
+def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas2(session):
+    """Verify to_pandas_batches() works with empty nested JSON types in pandas 2.x."""
+
+    sql = """
+        SELECT
+            1 AS id,
+            [] AS json_array,
+            STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+
+    # The main point: this should not raise an error
+    batches = list(df.to_pandas_batches())
+    assert sum(len(b) for b in batches) == 1
+
+    assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+    assert isinstance(batches[0].dtypes["json_struct"].pyarrow_dtype, pa.StructType)
+
+
 @pytest.mark.parametrize("allow_large_results", (True, False))
 def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
     """Verify to_pandas_batches() APIs returns the expected page size.