Skip to content

Commit 62f7e9f

Browse files
authored
fix!: turn Series.struct.dtypes into a property to match pandas (#2169)
Also, implement some struct operations in the polars compiler.
1 parent a6f87a0 commit 62f7e9f

File tree

6 files changed

+210
-16
lines changed

6 files changed

+210
-16
lines changed

bigframes/core/compile/polars/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
# polars shouldn't be needed at import time, as register is a no-op if polars
2525
# isn't installed.
2626
import bigframes.core.compile.polars.operations.generic_ops # noqa: F401
27+
import bigframes.core.compile.polars.operations.struct_ops # noqa: F401
2728

2829
try:
2930
import bigframes._importing

bigframes/core/compile/polars/operations/struct_ops.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
BigFrames -> Polars compilation for the operations in bigframes.operations.struct_ops.
17+
18+
Please keep implementations in sequential order by op name.
19+
"""
20+
21+
from __future__ import annotations
22+
23+
from typing import TYPE_CHECKING
24+
25+
import bigframes_vendored.constants
26+
27+
import bigframes.core.compile.polars.compiler as polars_compiler
28+
from bigframes.operations import struct_ops
29+
30+
if TYPE_CHECKING:
31+
import polars as pl
32+
33+
34+
@polars_compiler.register_op(struct_ops.StructFieldOp)
35+
def struct_field_op_impl(
36+
compiler: polars_compiler.PolarsExpressionCompiler,
37+
op: struct_ops.StructFieldOp, # type: ignore
38+
input: pl.Expr,
39+
) -> pl.Expr:
40+
if isinstance(op.name_or_index, str):
41+
name = op.name_or_index
42+
else:
43+
raise NotImplementedError(
44+
"Referencing a struct field by number not implemented in polars compiler. "
45+
f"{bigframes_vendored.constants.FEEDBACK_LINK}"
46+
)
47+
48+
return input.struct.field(name)

bigframes/operations/structs.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,8 @@
1717
import bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors
1818
import pandas as pd
1919

20-
from bigframes.core import log_adapter
20+
from bigframes.core import backports, log_adapter
2121
import bigframes.dataframe
22-
import bigframes.dtypes
2322
import bigframes.operations
2423
import bigframes.operations.base
2524
import bigframes.series
@@ -45,17 +44,24 @@ def explode(self) -> bigframes.dataframe.DataFrame:
4544

4645
pa_type = self._dtype.pyarrow_dtype
4746
return bigframes.pandas.concat(
48-
[self.field(i) for i in range(pa_type.num_fields)], axis="columns"
47+
[
48+
self.field(field.name)
49+
for field in backports.pyarrow_struct_type_fields(pa_type)
50+
],
51+
axis="columns",
4952
)
5053

54+
@property
5155
def dtypes(self) -> pd.Series:
5256
pa_type = self._dtype.pyarrow_dtype
5357
return pd.Series(
5458
data=[
55-
bigframes.dtypes.arrow_dtype_to_bigframes_dtype(pa_type.field(i).type)
56-
for i in range(pa_type.num_fields)
59+
pd.ArrowDtype(field.type)
60+
for field in backports.pyarrow_struct_type_fields(pa_type)
61+
],
62+
index=[
63+
field.name for field in backports.pyarrow_struct_type_fields(pa_type)
5764
],
58-
index=[pa_type.field(i).name for i in range(pa_type.num_fields)],
5965
)
6066

6167

notebooks/data_types/struct.ipynb

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -211,11 +211,11 @@
211211
{
212212
"data": {
213213
"text/plain": [
214-
"0 [{'tables': {'score': 0.9349926710128784, 'val...\n",
215-
"1 [{'tables': {'score': 0.9690881371498108, 'val...\n",
216-
"2 [{'tables': {'score': 0.8667634129524231, 'val...\n",
217-
"3 [{'tables': {'score': 0.9351968765258789, 'val...\n",
218-
"4 [{'tables': {'score': 0.8572560548782349, 'val...\n",
214+
"0 [{'tables': {'score': 0.8667634129524231, 'val...\n",
215+
"1 [{'tables': {'score': 0.9351968765258789, 'val...\n",
216+
"2 [{'tables': {'score': 0.8572560548782349, 'val...\n",
217+
"3 [{'tables': {'score': 0.9690881371498108, 'val...\n",
218+
"4 [{'tables': {'score': 0.9349926710128784, 'val...\n",
219219
"Name: predicted_default_payment_next_month, dtype: list<item: struct<tables: struct<score: double, value: string>>>[pyarrow]"
220220
]
221221
},
@@ -267,7 +267,7 @@
267267
}
268268
],
269269
"source": [
270-
"df['Address'].struct.dtypes()"
270+
"df['Address'].struct.dtypes"
271271
]
272272
},
273273
{
@@ -461,7 +461,7 @@
461461
],
462462
"metadata": {
463463
"kernelspec": {
464-
"display_name": "Python 3",
464+
"display_name": "venv",
465465
"language": "python",
466466
"name": "python3"
467467
},
@@ -475,7 +475,7 @@
475475
"name": "python",
476476
"nbconvert_exporter": "python",
477477
"pygments_lexer": "ipython3",
478-
"version": "3.12.1"
478+
"version": "3.12.9"
479479
}
480480
},
481481
"nbformat": 4,

tests/unit/test_series_struct.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import pathlib
18+
from typing import Generator, TYPE_CHECKING
19+
20+
import pandas as pd
21+
import pandas.testing
22+
import pyarrow as pa # type: ignore
23+
import pytest
24+
25+
import bigframes
26+
27+
if TYPE_CHECKING:
28+
from bigframes.testing import polars_session
29+
30+
pytest.importorskip("polars")
31+
pytest.importorskip("pandas", minversion="2.2.0")
32+
33+
CURRENT_DIR = pathlib.Path(__file__).parent
34+
DATA_DIR = CURRENT_DIR.parent / "data"
35+
36+
37+
@pytest.fixture(scope="module", autouse=True)
38+
def session() -> Generator[bigframes.Session, None, None]:
39+
import bigframes.core.global_session
40+
from bigframes.testing import polars_session
41+
42+
session = polars_session.TestSession()
43+
with bigframes.core.global_session._GlobalSessionContext(session):
44+
yield session
45+
46+
47+
@pytest.fixture
48+
def struct_df(session: polars_session.TestSession):
49+
pa_type = pa.struct(
50+
[
51+
("str_field", pa.string()),
52+
("int_field", pa.int64()),
53+
]
54+
)
55+
return session.DataFrame(
56+
{
57+
"struct_col": pd.Series(
58+
pa.array(
59+
[
60+
{
61+
"str_field": "my string",
62+
"int_field": 1,
63+
},
64+
{
65+
"str_field": None,
66+
"int_field": 2,
67+
},
68+
{
69+
"str_field": "another string",
70+
"int_field": None,
71+
},
72+
{
73+
"str_field": "some string",
74+
"int_field": 3,
75+
},
76+
],
77+
pa_type,
78+
),
79+
dtype=pd.ArrowDtype(pa_type),
80+
),
81+
}
82+
)
83+
84+
85+
@pytest.fixture
86+
def struct_series(struct_df):
87+
return struct_df["struct_col"]
88+
89+
90+
def test_struct_dtypes(struct_series):
91+
bf_series = struct_series
92+
pd_series = struct_series.to_pandas()
93+
assert isinstance(pd_series.dtype, pd.ArrowDtype)
94+
95+
bf_result = bf_series.struct.dtypes
96+
pd_result = pd_series.struct.dtypes
97+
98+
pandas.testing.assert_series_equal(bf_result, pd_result)
99+
100+
101+
@pytest.mark.parametrize(
102+
("field_name", "common_dtype"),
103+
(
104+
("str_field", "string[pyarrow]"),
105+
("int_field", "int64[pyarrow]"),
106+
# TODO(tswast): Support referencing fields by number, too.
107+
),
108+
)
109+
def test_struct_field(struct_series, field_name, common_dtype):
110+
bf_series = struct_series
111+
pd_series = struct_series.to_pandas()
112+
assert isinstance(pd_series.dtype, pd.ArrowDtype)
113+
114+
bf_result = bf_series.struct.field(field_name).to_pandas()
115+
pd_result = pd_series.struct.field(field_name)
116+
117+
# TODO(tswast): if/when we support arrowdtype for int/string, we can remove
118+
# this cast.
119+
bf_result = bf_result.astype(common_dtype)
120+
pd_result = pd_result.astype(common_dtype)
121+
122+
pandas.testing.assert_series_equal(bf_result, pd_result)
123+
124+
125+
def test_struct_explode(struct_series):
126+
bf_series = struct_series
127+
pd_series = struct_series.to_pandas()
128+
assert isinstance(pd_series.dtype, pd.ArrowDtype)
129+
130+
bf_result = bf_series.struct.explode().to_pandas()
131+
pd_result = pd_series.struct.explode()
132+
133+
pandas.testing.assert_frame_equal(
134+
bf_result,
135+
pd_result,
136+
# TODO(tswast): remove if/when we support arrowdtype for int/string.
137+
check_dtype=False,
138+
)

third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ def explode(self):
158158
"""
159159
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
160160

161+
@property
161162
def dtypes(self):
162163
"""
163164
Return the dtype object of each child field of the struct.
@@ -177,8 +178,8 @@ def dtypes(self):
177178
... [("version", pa.int64()), ("project", pa.string())]
178179
... ))
179180
... )
180-
>>> s.struct.dtypes()
181-
version Int64
181+
>>> s.struct.dtypes
182+
version int64[pyarrow]
182183
project string[pyarrow]
183184
dtype: object
184185

0 commit comments

Comments
 (0)