Commit 3d8b17f

fix: support results with STRUCT and ARRAY columns containing JSON subfields in to_pandas_batches() (#2216)
* Correctly display DataFrames with JSON columns in anywidget
* Improve JSON type handling for to_gbq and to_pandas_batches
* Revert "Correctly display DataFrames with JSON columns in anywidget" (this reverts commit 8c34512)
* Remove unnecessary comment
* Refactor code
* Update test case
* Fix test case
* Update function call in bigframes/core/blocks.py; remove unused function from bigframes/dtypes.py
* Revert the code refactor in loader.py; that refactor will land in a separate PR
* Replace the manual construction of the empty DataFrame with a more robust try...except block that leverages to_pyarrow and empty_table
* Fix test case
* Use the existing arrow_to_pandas() helper, which properly handles dtype conversion
* Update test case
* Refactor test case
* Add pyarrow issue id to comments
1 parent 94c8b3c commit 3d8b17f
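As a hedged usage sketch (not part of the commit), this is the kind of query the fix targets. It assumes a configured BigQuery DataFrames session and uses bpd as the conventional alias for bigframes.pandas:

import bigframes.pandas as bpd

sql = """
    SELECT
        0 AS id,
        [JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
        STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
"""
df = bpd.read_gbq(sql, index_col="id")

# Before this fix, streaming results with JSON nested in ARRAY or STRUCT
# columns could fail while building the empty placeholder DataFrame.
for batch in df.to_pandas_batches():
    print(batch.dtypes)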

File tree: 2 files changed, +96 −6 lines


bigframes/core/blocks.py

Lines changed: 10 additions & 6 deletions

@@ -68,6 +68,7 @@
 import bigframes.operations.aggregations as agg_ops
 from bigframes.session import dry_runs, execution_spec
 from bigframes.session import executor as executors
+from bigframes.session._io import pandas as io_pandas

 # Type constraint for wherever column labels are used
 Label = typing.Hashable
@@ -711,12 +712,15 @@ def to_pandas_batches(
         # To reduce the number of edge cases to consider when working with the
         # results of this, always return at least one DataFrame. See:
         # b/428918844.
-        empty_val = pd.DataFrame(
-            {
-                col: pd.Series([], dtype=self.expr.get_column_type(col))
-                for col in itertools.chain(self.value_columns, self.index_columns)
-            }
-        )
+        try:
+            empty_arrow_table = self.expr.schema.to_pyarrow().empty_table()
+        except pa.ArrowNotImplementedError:
+            # Bug with some pyarrow versions (https://github.com/apache/arrow/issues/45262):
+            # empty_table only supports base storage types, not extension types.
+            empty_arrow_table = self.expr.schema.to_pyarrow(
+                use_storage_types=True
+            ).empty_table()
+        empty_val = io_pandas.arrow_to_pandas(empty_arrow_table, self.expr.schema)
         dfs = map(
             lambda a: a[0],
             itertools.zip_longest(
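For context outside bigframes internals, here is a minimal, hedged sketch of the same fallback pattern in plain pyarrow. Assumptions: JSONArrowType is the Arrow extension type from the db-dtypes package that BigQuery DataFrames uses for JSON columns, and _storage_type is a hypothetical helper approximating what schema.to_pyarrow(use_storage_types=True) does internally.

import pyarrow as pa
import db_dtypes  # assumed: provides JSONArrowType (storage type: string)


def _storage_type(dtype: pa.DataType) -> pa.DataType:
    """Recursively swap extension types for their storage types (hypothetical helper)."""
    if isinstance(dtype, pa.ExtensionType):
        return _storage_type(dtype.storage_type)
    if pa.types.is_list(dtype):
        return pa.list_(_storage_type(dtype.value_type))
    if pa.types.is_struct(dtype):
        return pa.struct([pa.field(f.name, _storage_type(f.type)) for f in dtype])
    return dtype


# A schema like the ones the new tests produce: JSON nested in ARRAY and STRUCT.
schema = pa.schema(
    [
        ("id", pa.int64()),
        ("json_array", pa.list_(db_dtypes.JSONArrowType())),
        ("json_struct", pa.struct([("json_field", db_dtypes.JSONArrowType())])),
    ]
)

try:
    empty_table = schema.empty_table()
except pa.ArrowNotImplementedError:
    # Affected pyarrow versions cannot build empty arrays for (nested)
    # extension types (apache/arrow#45262); retry with storage types only.
    empty_table = pa.schema(
        [(f.name, _storage_type(f.type)) for f in schema]
    ).empty_table()

print(empty_table.schema)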

tests/system/small/test_dataframe_io.py

Lines changed: 86 additions & 0 deletions

@@ -376,6 +376,92 @@ def test_to_pandas_batches_w_empty_dataframe(session):
     pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes)


+@pytest.mark.skipif(
+    bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 1.x behavior only",
+)
+def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas1(session):
+    """Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 1.x."""
+    sql = """
+        SELECT
+            0 AS id,
+            [JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
+            STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+    batches = list(df.to_pandas_batches())
+
+    assert batches[0].dtypes["json_array"] == "object"
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+
+
+@pytest.mark.skipif(
+    not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 2.x behavior only",
+)
+def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas2(session):
+    """Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 2.x."""
+    sql = """
+        SELECT
+            0 AS id,
+            [JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
+            STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+    batches = list(df.to_pandas_batches())
+
+    assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
+    assert isinstance(batches[0].dtypes["json_array"].pyarrow_dtype, pa.ListType)
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+
+
+@pytest.mark.skipif(
+    bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 1.x behavior only",
+)
+def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas1(session):
+    """Verify to_pandas_batches() works with empty nested JSON types in pandas 1.x."""
+
+    sql = """
+        SELECT
+            1 AS id,
+            [] AS json_array,
+            STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+
+    # The main point: this should not raise an error
+    batches = list(df.to_pandas_batches())
+    assert sum(len(b) for b in batches) == 1
+
+    assert batches[0].dtypes["json_array"] == "object"
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+
+
+@pytest.mark.skipif(
+    not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 2.x behavior only",
+)
+def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas2(session):
+    """Verify to_pandas_batches() works with empty nested JSON types in pandas 2.x."""
+
+    sql = """
+        SELECT
+            1 AS id,
+            [] AS json_array,
+            STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+
+    # The main point: this should not raise an error
+    batches = list(df.to_pandas_batches())
+    assert sum(len(b) for b in batches) == 1
+
+    assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+    assert isinstance(batches[0].dtypes["json_struct"].pyarrow_dtype, pa.StructType)
+
+
 @pytest.mark.parametrize("allow_large_results", (True, False))
 def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
     """Verify to_pandas_batches() APIs returns the expected page size.