Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bigframes/core/compile/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
# polars shouldn't be needed at import time, as register is a no-op if polars
# isn't installed.
import bigframes.core.compile.polars.operations.generic_ops # noqa: F401
import bigframes.core.compile.polars.operations.struct_ops # noqa: F401

try:
import bigframes._importing
Expand Down
48 changes: 48 additions & 0 deletions bigframes/core/compile/polars/operations/struct_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
BigFrames -> Polars compilation for the operations in bigframes.operations.struct_ops.

Please keep implementations in sequential order by op name.
"""

from __future__ import annotations

from typing import TYPE_CHECKING

import bigframes_vendored.constants

import bigframes.core.compile.polars.compiler as polars_compiler
from bigframes.operations import struct_ops

if TYPE_CHECKING:
import polars as pl


@polars_compiler.register_op(struct_ops.StructFieldOp)
def struct_field_op_impl(
    compiler: polars_compiler.PolarsExpressionCompiler,
    op: struct_ops.StructFieldOp,  # type: ignore
    input: pl.Expr,
) -> pl.Expr:
    """Compile ``StructFieldOp`` into a polars struct-field expression.

    Args:
        compiler: The expression compiler invoking this implementation
            (not used by this op).
        op: The op whose ``name_or_index`` selects the field.
        input: The polars expression producing the struct values.

    Returns:
        ``input.struct.field(name)`` for the requested field name.

    Raises:
        NotImplementedError: If ``op.name_or_index`` is not a ``str``.
    """
    field_ref = op.name_or_index

    # Guard clause: only lookups by field name are handled here.
    if not isinstance(field_ref, str):
        raise NotImplementedError(
            "Referencing a struct field by number not implemented in polars compiler. "
            f"{bigframes_vendored.constants.FEEDBACK_LINK}"
        )

    return input.struct.field(field_ref)
18 changes: 12 additions & 6 deletions bigframes/operations/structs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@
import bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors
import pandas as pd

from bigframes.core import log_adapter
from bigframes.core import backports, log_adapter
import bigframes.dataframe
import bigframes.dtypes
import bigframes.operations
import bigframes.operations.base
import bigframes.series
Expand All @@ -45,17 +44,24 @@ def explode(self) -> bigframes.dataframe.DataFrame:

pa_type = self._dtype.pyarrow_dtype
return bigframes.pandas.concat(
[self.field(i) for i in range(pa_type.num_fields)], axis="columns"
[
self.field(field.name)
for field in backports.pyarrow_struct_type_fields(pa_type)
],
axis="columns",
)

@property
def dtypes(self) -> pd.Series:
pa_type = self._dtype.pyarrow_dtype
return pd.Series(
data=[
bigframes.dtypes.arrow_dtype_to_bigframes_dtype(pa_type.field(i).type)
for i in range(pa_type.num_fields)
pd.ArrowDtype(field.type)
for field in backports.pyarrow_struct_type_fields(pa_type)
],
index=[
field.name for field in backports.pyarrow_struct_type_fields(pa_type)
],
index=[pa_type.field(i).name for i in range(pa_type.num_fields)],
)


Expand Down
16 changes: 8 additions & 8 deletions notebooks/data_types/struct.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -211,11 +211,11 @@
{
"data": {
"text/plain": [
"0 [{'tables': {'score': 0.9349926710128784, 'val...\n",
"1 [{'tables': {'score': 0.9690881371498108, 'val...\n",
"2 [{'tables': {'score': 0.8667634129524231, 'val...\n",
"3 [{'tables': {'score': 0.9351968765258789, 'val...\n",
"4 [{'tables': {'score': 0.8572560548782349, 'val...\n",
"0 [{'tables': {'score': 0.8667634129524231, 'val...\n",
"1 [{'tables': {'score': 0.9351968765258789, 'val...\n",
"2 [{'tables': {'score': 0.8572560548782349, 'val...\n",
"3 [{'tables': {'score': 0.9690881371498108, 'val...\n",
"4 [{'tables': {'score': 0.9349926710128784, 'val...\n",
"Name: predicted_default_payment_next_month, dtype: list<item: struct<tables: struct<score: double, value: string>>>[pyarrow]"
]
},
Expand Down Expand Up @@ -267,7 +267,7 @@
}
],
"source": [
"df['Address'].struct.dtypes()"
"df['Address'].struct.dtypes"
]
},
{
Expand Down Expand Up @@ -461,7 +461,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "venv",
"language": "python",
"name": "python3"
},
Expand All @@ -475,7 +475,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
"version": "3.12.9"
}
},
"nbformat": 4,
Expand Down
138 changes: 138 additions & 0 deletions tests/unit/test_series_struct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import pathlib
from typing import Generator, TYPE_CHECKING

import pandas as pd
import pandas.testing
import pyarrow as pa # type: ignore
import pytest

import bigframes

if TYPE_CHECKING:
from bigframes.testing import polars_session

# Skip every test in this module when polars cannot be imported.
pytest.importorskip("polars")
# Also skip unless pandas is at least 2.2.0 (presumably needed for the
# `Series.struct` accessor behavior these tests compare against -- confirm).
pytest.importorskip("pandas", minversion="2.2.0")

# Directory holding this test file, and the sibling "data" directory one
# level above it.
CURRENT_DIR = pathlib.Path(__file__).parent
DATA_DIR = CURRENT_DIR.parent / "data"


@pytest.fixture(scope="module", autouse=True)
def session() -> Generator[bigframes.Session, None, None]:
    """Yield a ``polars_session.TestSession`` inside a ``_GlobalSessionContext``.

    The fixture is module-scoped and autouse, so one session is created per
    test module and the context stays active until the module's tests finish.
    """
    # Function-scope imports (presumably so that importing this module never
    # needs polars itself -- confirm).
    import bigframes.core.global_session
    from bigframes.testing import polars_session

    test_session = polars_session.TestSession()
    global_context = bigframes.core.global_session._GlobalSessionContext(
        test_session
    )
    with global_context:
        yield test_session


@pytest.fixture
def struct_df(session: polars_session.TestSession):
    """Build a DataFrame with a single struct-typed column, ``struct_col``.

    Each struct has a string field ``str_field`` and an int64 field
    ``int_field``; the four rows include a null in each field.
    """
    struct_type = pa.struct(
        [
            ("str_field", pa.string()),
            ("int_field", pa.int64()),
        ]
    )
    rows = [
        {"str_field": "my string", "int_field": 1},
        {"str_field": None, "int_field": 2},
        {"str_field": "another string", "int_field": None},
        {"str_field": "some string", "int_field": 3},
    ]
    struct_values = pd.Series(
        pa.array(rows, struct_type),
        dtype=pd.ArrowDtype(struct_type),
    )
    return session.DataFrame({"struct_col": struct_values})


@pytest.fixture
def struct_series(struct_df):
    """Return the ``struct_col`` column selected from ``struct_df``."""
    column = struct_df["struct_col"]
    return column


def test_struct_dtypes(struct_series):
    """``struct.dtypes`` on the fixture series equals the pandas result."""
    pandas_series = struct_series.to_pandas()
    # The comparison only makes sense if pandas kept the Arrow struct dtype.
    assert isinstance(pandas_series.dtype, pd.ArrowDtype)

    actual = struct_series.struct.dtypes
    expected = pandas_series.struct.dtypes

    pandas.testing.assert_series_equal(actual, expected)


@pytest.mark.parametrize(
    "field_name, common_dtype",
    [
        ("str_field", "string[pyarrow]"),
        ("int_field", "int64[pyarrow]"),
        # TODO(tswast): Support referencing fields by number, too.
    ],
)
def test_struct_field(struct_series, field_name, common_dtype):
    """``struct.field(name)`` matches pandas after casting to a shared dtype."""
    pandas_series = struct_series.to_pandas()
    # The comparison only makes sense if pandas kept the Arrow struct dtype.
    assert isinstance(pandas_series.dtype, pd.ArrowDtype)

    actual = struct_series.struct.field(field_name).to_pandas()
    expected = pandas_series.struct.field(field_name)

    # TODO(tswast): if/when we support arrowdtype for int/string, we can remove
    # this cast.
    pandas.testing.assert_series_equal(
        actual.astype(common_dtype),
        expected.astype(common_dtype),
    )


def test_struct_explode(struct_series):
    """``struct.explode()`` produces the same frame as pandas, ignoring dtypes."""
    pandas_series = struct_series.to_pandas()
    # The comparison only makes sense if pandas kept the Arrow struct dtype.
    assert isinstance(pandas_series.dtype, pd.ArrowDtype)

    actual = struct_series.struct.explode().to_pandas()
    expected = pandas_series.struct.explode()

    # TODO(tswast): remove if/when we support arrowdtype for int/string.
    dtype_check = False
    pandas.testing.assert_frame_equal(actual, expected, check_dtype=dtype_check)
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def explode(self):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

@property
def dtypes(self):
"""
Return the dtype object of each child field of the struct.
Expand All @@ -177,8 +178,8 @@ def dtypes(self):
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> s.struct.dtypes()
version Int64
>>> s.struct.dtypes
version int64[pyarrow]
project string[pyarrow]
dtype: object

Expand Down