Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions bigframes/core/array_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import functools
import typing
from typing import Iterable, List, Mapping, Optional, Sequence, Tuple
import warnings

import google.cloud.bigquery
import pandas
Expand All @@ -37,7 +36,6 @@
import bigframes.core.tree_properties
from bigframes.core.window_spec import WindowSpec
import bigframes.dtypes
import bigframes.exceptions as bfe
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops

Expand Down Expand Up @@ -101,12 +99,6 @@ def from_table(
):
if offsets_col and primary_key:
raise ValueError("must set at most one of 'offests', 'primary_key'")
if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
msg = bfe.format_message(
"JSON column interpretation as a custom PyArrow extention in `db_dtypes` "
"is a preview feature and subject to change."
)
warnings.warn(msg, bfe.PreviewWarning)
# define data source only for needed columns, this makes row-hashing cheaper
table_def = nodes.GbqTable.from_table(table, columns=schema.names)

Expand Down
33 changes: 33 additions & 0 deletions bigframes/core/backports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helpers for working across versions of different dependencies."""

from typing import List

import pyarrow


def pyarrow_struct_type_fields(struct_type: pyarrow.StructType) -> List[pyarrow.Field]:
    """Return the fields of ``struct_type``, portably across pyarrow versions.

    ``StructType.fields`` was added in pyarrow 18; on older versions fall back
    to indexing each field via ``field(i)``.

    See: https://arrow.apache.org/docs/18.0/python/generated/pyarrow.StructType.html
    """
    try:
        # pyarrow >= 18 exposes the fields directly as a property.
        return struct_type.fields
    except AttributeError:
        # Older pyarrow: collect fields one-by-one by index.
        return [struct_type.field(index) for index in range(struct_type.num_fields)]
8 changes: 6 additions & 2 deletions bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,12 +171,16 @@ def shape(self) -> typing.Tuple[int]:

@property
def dtype(self):
    """Return the dtype of the index (object dtype for a MultiIndex)."""
    # A single-level index has a concrete dtype; multi-level indexes are
    # reported as numpy object dtype, matching pandas behavior.
    dtype = self._block.index.dtypes[0] if self.nlevels == 1 else np.dtype("O")
    # Warn if the dtype is (or nests) the db_dtypes JSON extension type,
    # whose representation is expected to change in a future release.
    bigframes.dtypes.warn_on_db_dtypes_json_dtype([dtype])
    return dtype

@property
def dtypes(self) -> pandas.Series:
    """Return a Series of the dtypes for each index level, keyed by level name."""
    dtypes = self._block.index.dtypes
    # Warn if any level dtype is (or nests) the db_dtypes JSON extension
    # type, whose representation is expected to change in a future release.
    bigframes.dtypes.warn_on_db_dtypes_json_dtype(dtypes)
    return pandas.Series(
        data=dtypes,
        index=typing.cast(typing.Tuple, self._block.index.names),
    )

Expand Down
4 changes: 3 additions & 1 deletion bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,9 @@ def at(self) -> indexers.AtDataFrameIndexer:

@property
def dtypes(self) -> pandas.Series:
    """Return a Series of the dtypes for each column, keyed by column label."""
    dtypes = self._block.dtypes
    # Warn if any column dtype is (or nests) the db_dtypes JSON extension
    # type, whose representation is expected to change in a future release.
    bigframes.dtypes.warn_on_db_dtypes_json_dtype(dtypes)
    return pandas.Series(data=dtypes, index=self._block.column_labels)

@property
def columns(self) -> pandas.Index:
Expand Down
50 changes: 46 additions & 4 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import textwrap
import typing
from typing import Any, Dict, List, Literal, Sequence, Union
import warnings

import bigframes_vendored.constants as constants
import db_dtypes # type: ignore
Expand All @@ -30,6 +31,9 @@
import pyarrow as pa
import shapely.geometry # type: ignore

import bigframes.core.backports
import bigframes.exceptions

# Type hints for Pandas dtypes supported by BigQuery DataFrame
Dtype = Union[
pd.BooleanDtype,
Expand Down Expand Up @@ -62,7 +66,8 @@
# No arrow equivalent
GEO_DTYPE = gpd.array.GeometryDtype()
# JSON
# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
# TODO(https://github.com/pandas-dev/pandas/issues/60958): switch to
# pyarrow.json_(pyarrow.string()) when pandas 3+ and pyarrow 18+ is installed.
JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
OBJ_REF_DTYPE = pd.ArrowDtype(
Expand Down Expand Up @@ -368,8 +373,7 @@ def get_struct_fields(type_: ExpressionType) -> dict[str, Dtype]:
assert isinstance(type_.pyarrow_dtype, pa.StructType)
struct_type = type_.pyarrow_dtype
result: dict[str, Dtype] = {}
for field_no in range(struct_type.num_fields):
field = struct_type.field(field_no)
for field in bigframes.core.backports.pyarrow_struct_type_fields(struct_type):
result[field.name] = arrow_dtype_to_bigframes_dtype(field.type)
return result

Expand Down Expand Up @@ -547,7 +551,8 @@ def arrow_type_to_literal(
return [arrow_type_to_literal(arrow_type.value_type)]
if pa.types.is_struct(arrow_type):
return {
field.name: arrow_type_to_literal(field.type) for field in arrow_type.fields
field.name: arrow_type_to_literal(field.type)
for field in bigframes.core.backports.pyarrow_struct_type_fields(arrow_type)
}
if pa.types.is_string(arrow_type):
return "string"
Expand Down Expand Up @@ -915,3 +920,40 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype:


TIMEDELTA_DESCRIPTION_TAG = "#microseconds"


def contains_db_dtypes_json_arrow_type(type_):
    """Whether ``type_`` is, or nests, the db_dtypes JSON Arrow extension type.

    Lists and structs are searched for nested JSON; all other Arrow types
    report False.
    """
    # Iterative worklist traversal instead of recursion: pop a type, test it,
    # and push any child types (list value type, struct field types).
    pending = [type_]
    while pending:
        current = pending.pop()
        if isinstance(current, db_dtypes.JSONArrowType):
            return True
        if isinstance(current, pa.ListType):
            pending.append(current.value_type)
        elif isinstance(current, pa.StructType):
            pending.extend(
                field.type
                for field in bigframes.core.backports.pyarrow_struct_type_fields(
                    current
                )
            )
    return False


def contains_db_dtypes_json_dtype(dtype):
    """Whether a pandas dtype carries (possibly nested) db_dtypes JSON data.

    Only ``pandas.ArrowDtype`` values can wrap the JSON Arrow extension type;
    any other dtype trivially does not contain it.
    """
    if isinstance(dtype, pd.ArrowDtype):
        return contains_db_dtypes_json_arrow_type(dtype.pyarrow_dtype)
    return False


def warn_on_db_dtypes_json_dtype(dtypes):
    """Emit a JSONDtypeWarning if any of ``dtypes`` carries db_dtypes JSON data.

    At most one warning is emitted per call.

    Note: only call this function if the user is explicitly checking the
    dtypes.
    """
    for dtype in dtypes:
        if not contains_db_dtypes_json_dtype(dtype):
            continue
        msg = bigframes.exceptions.format_message(
            "JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_()) "
            "instead of using `db_dtypes` in the future when available in pandas "
            "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow."
        )
        warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)
        return
4 changes: 4 additions & 0 deletions bigframes/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ class FunctionAxisOnePreviewWarning(PreviewWarning):
"""Remote Function and Managed UDF with axis=1 preview."""


class JSONDtypeWarning(PreviewWarning):
    """JSON dtype will be pd.ArrowDtype(pa.json_()) in the future.

    Issued when a user inspects a dtype that currently uses the db_dtypes
    JSON Arrow extension type; the representation is expected to change
    once native JSON type support is available in pandas and pyarrow.
    """


class FunctionConflictTypeHintWarning(UserWarning):
    """Conflicting type hints in a BigFrames function.

    NOTE(review): presumably issued when two sources of type information for
    a function disagree — confirm against the function-binding code that
    raises this warning.
    """

Expand Down
2 changes: 2 additions & 0 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,12 @@ def dt(self) -> dt.DatetimeMethods:

@property
def dtype(self):
    # Warn when the dtype is (or nests) the db_dtypes JSON extension type,
    # since its representation is expected to change in a future release.
    # Warning happens here (not at construction) so it only fires when the
    # user explicitly inspects the dtype.
    bigframes.dtypes.warn_on_db_dtypes_json_dtype([self._dtype])
    return self._dtype

@property
def dtypes(self):
    # For a Series, `dtypes` returns the same single dtype as `dtype`
    # (both bodies are identical), mirroring pandas' Series.dtypes alias.
    bigframes.dtypes.warn_on_db_dtypes_json_dtype([self._dtype])
    return self._dtype

@property
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import pandas as pd
import pyarrow as pa # type: ignore
import pytest
import shapely.geometry # type: ignore

import bigframes.core.compile.ibis_types
import bigframes.dtypes
Expand Down Expand Up @@ -225,22 +224,6 @@ def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str):
assert result == ibis_dtype


@pytest.mark.parametrize(
["python_type", "expected_dtype"],
[
(bool, bigframes.dtypes.BOOL_DTYPE),
(int, bigframes.dtypes.INT_DTYPE),
(str, bigframes.dtypes.STRING_DTYPE),
(shapely.geometry.Point, bigframes.dtypes.GEO_DTYPE),
(shapely.geometry.Polygon, bigframes.dtypes.GEO_DTYPE),
(shapely.geometry.base.BaseGeometry, bigframes.dtypes.GEO_DTYPE),
],
)
def test_bigframes_type_supports_python_types(python_type, expected_dtype):
got_dtype = bigframes.dtypes.bigframes_type(python_type)
assert got_dtype == expected_dtype


def test_unsupported_dtype_raises_unexpected_datatype():
"""Incompatible dtypes should fail when passed into BigQuery DataFrames"""
with pytest.raises(ValueError, match="Datatype has no ibis type mapping"):
Expand All @@ -265,19 +248,3 @@ def test_literal_to_ibis_scalar_converts(literal, ibis_scalar):
assert bigframes.core.compile.ibis_types.literal_to_ibis_scalar(literal).equals(
ibis_scalar
)


@pytest.mark.parametrize(
["scalar", "expected_dtype"],
[
(pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
(pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
(pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE),
# Support NULL scalars.
(pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
(pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
(pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE),
],
)
def test_infer_literal_type_arrow_scalar(scalar, expected_dtype):
assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype
73 changes: 73 additions & 0 deletions tests/unit/test_dtypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import db_dtypes # type: ignore
import pyarrow as pa # type: ignore
import pytest
import shapely.geometry # type: ignore

import bigframes.dtypes


@pytest.mark.parametrize(
    ["python_type", "expected_dtype"],
    [
        (bool, bigframes.dtypes.BOOL_DTYPE),
        (int, bigframes.dtypes.INT_DTYPE),
        (str, bigframes.dtypes.STRING_DTYPE),
        # All shapely geometry classes map to the single GEO dtype.
        (shapely.geometry.Point, bigframes.dtypes.GEO_DTYPE),
        (shapely.geometry.Polygon, bigframes.dtypes.GEO_DTYPE),
        (shapely.geometry.base.BaseGeometry, bigframes.dtypes.GEO_DTYPE),
    ],
)
def test_bigframes_type_supports_python_types(python_type, expected_dtype):
    """Python builtin/shapely types map to the expected BigFrames dtype."""
    got_dtype = bigframes.dtypes.bigframes_type(python_type)
    assert got_dtype == expected_dtype


@pytest.mark.parametrize(
    ["scalar", "expected_dtype"],
    [
        (pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
        (pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
        (pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE),
        # Support NULL scalars: the dtype comes from the scalar's type,
        # not its (missing) value.
        (pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
        (pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
        (pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE),
    ],
)
def test_infer_literal_type_arrow_scalar(scalar, expected_dtype):
    """pyarrow scalars (including NULLs) infer the expected BigFrames dtype."""
    assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype


@pytest.mark.parametrize(
    ["type_", "expected"],
    [
        # Plain scalar type: no JSON.
        (pa.int64(), False),
        # The JSON extension type itself.
        (db_dtypes.JSONArrowType(), True),
        # Structs: detected only when a field (transitively) holds JSON.
        (pa.struct([("int", pa.int64()), ("str", pa.string())]), False),
        (pa.struct([("int", pa.int64()), ("json", db_dtypes.JSONArrowType())]), True),
        # Lists: detected via the element type.
        (pa.list_(pa.int64()), False),
        (pa.list_(db_dtypes.JSONArrowType()), True),
        # Deep nesting: list of struct containing JSON.
        (
            pa.list_(
                pa.struct([("int", pa.int64()), ("json", db_dtypes.JSONArrowType())])
            ),
            True,
        ),
    ],
)
def test_contains_db_dtypes_json_arrow_type(type_, expected):
    """JSON extension types are found at any nesting depth in lists/structs."""
    assert bigframes.dtypes.contains_db_dtypes_json_arrow_type(type_) == expected