Merged
Changes from 3 commits
8 changes: 0 additions & 8 deletions bigframes/core/array_value.py
@@ -18,7 +18,6 @@
 import functools
 import typing
 from typing import Iterable, List, Mapping, Optional, Sequence, Tuple
-import warnings
 
 import google.cloud.bigquery
 import pandas
@@ -37,7 +36,6 @@
 import bigframes.core.tree_properties
 from bigframes.core.window_spec import WindowSpec
 import bigframes.dtypes
-import bigframes.exceptions as bfe
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
 
@@ -101,12 +99,6 @@ def from_table(
 ):
     if offsets_col and primary_key:
         raise ValueError("must set at most one of 'offsets', 'primary_key'")
-    if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
-        msg = bfe.format_message(
-            "JSON column interpretation as a custom PyArrow extension in `db_dtypes` "
-            "is a preview feature and subject to change."
-        )
-        warnings.warn(msg, bfe.PreviewWarning)
     # define data source only for needed columns, this makes row-hashing cheaper
     table_def = nodes.GbqTable.from_table(table, columns=schema.names)
 
8 changes: 6 additions & 2 deletions bigframes/core/indexes/base.py
@@ -171,12 +171,16 @@ def shape(self) -> typing.Tuple[int]:
 
     @property
     def dtype(self):
-        return self._block.index.dtypes[0] if self.nlevels == 1 else np.dtype("O")
+        dtype = self._block.index.dtypes[0] if self.nlevels == 1 else np.dtype("O")
+        bigframes.dtypes.warn_on_db_dtypes_json_dtype([dtype])
+        return dtype
 
     @property
     def dtypes(self) -> pandas.Series:
+        dtypes = self._block.index.dtypes
+        bigframes.dtypes.warn_on_db_dtypes_json_dtype(dtypes)
         return pandas.Series(
-            data=self._block.index.dtypes,
+            data=dtypes,
             index=typing.cast(typing.Tuple, self._block.index.names),
         )
 
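These accessors raise the warning lazily, at the point where a JSON dtype is actually inspected, rather than eagerly when the table is read (see the removal in bigframes/core/array_value.py above). A minimal sketch of opting out, assuming the caller accepts the current db_dtypes-backed representation:

    import warnings

    import bigframes.exceptions

    # The dtype/dtypes accessors emit JSONDtypeWarning, so a standard
    # warnings filter silences it process-wide.
    warnings.simplefilter("ignore", bigframes.exceptions.JSONDtypeWarning)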
4 changes: 3 additions & 1 deletion bigframes/dataframe.py
@@ -321,7 +321,9 @@ def at(self) -> indexers.AtDataFrameIndexer:
 
     @property
     def dtypes(self) -> pandas.Series:
-        return pandas.Series(data=self._block.dtypes, index=self._block.column_labels)
+        dtypes = self._block.dtypes
+        bigframes.dtypes.warn_on_db_dtypes_json_dtype(dtypes)
+        return pandas.Series(data=dtypes, index=self._block.column_labels)
 
     @property
     def columns(self) -> pandas.Index:
42 changes: 41 additions & 1 deletion bigframes/dtypes.py
@@ -20,6 +20,7 @@
 import textwrap
 import typing
 from typing import Any, Dict, List, Literal, Sequence, Union
+import warnings
 
 import bigframes_vendored.constants as constants
 import db_dtypes  # type: ignore
@@ -30,6 +31,8 @@
 import pyarrow as pa
 import shapely.geometry  # type: ignore
 
+import bigframes.exceptions
+
 # Type hints for Pandas dtypes supported by BigQuery DataFrame
 Dtype = Union[
     pd.BooleanDtype,
@@ -62,7 +65,8 @@
 # No arrow equivalent
 GEO_DTYPE = gpd.array.GeometryDtype()
 # JSON
-# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
+# TODO(https://github.com/pandas-dev/pandas/issues/60958): switch to
+# pyarrow.json_(pyarrow.string()) when pandas 3+ and pyarrow 18+ are installed.
 JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
 JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
 OBJ_REF_DTYPE = pd.ArrowDtype(
@@ -915,3 +919,39 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype:
 
 
 TIMEDELTA_DESCRIPTION_TAG = "#microseconds"
+
+
+def contains_db_dtypes_json_arrow_type(type_):
+    if isinstance(type_, db_dtypes.JSONArrowType):
+        return True
+
+    if isinstance(type_, pa.ListType):
+        return contains_db_dtypes_json_arrow_type(type_.value_type)
+
+    if isinstance(type_, pa.StructType):
+        return any(
+            contains_db_dtypes_json_arrow_type(field.type) for field in type_.fields
+        )
+    return False
+
+
+def contains_db_dtypes_json_dtype(dtype):
+    if not isinstance(dtype, pd.ArrowDtype):
+        return False
+
+    return contains_db_dtypes_json_arrow_type(dtype.pyarrow_dtype)
+
+
+def warn_on_db_dtypes_json_dtype(dtypes):
+    """Warn that the JSON dtype is changing.
+
+    Note: only call this function if the user is explicitly checking the
+    dtypes.
+    """
+    if any(contains_db_dtypes_json_dtype(dtype) for dtype in dtypes):
+        msg = bigframes.exceptions.format_message(
+            "JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_()) "
+            "instead of using `db_dtypes` in the future when available in pandas "
+            "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow."
+        )
+        warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)
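The helper recurses through list and struct element types, so JSON nested inside containers is detected as well. A small sketch of the intended behavior, runnable against this branch (all names as defined in the hunk above):

    import warnings

    import db_dtypes
    import pandas as pd
    import pyarrow as pa

    import bigframes.dtypes
    import bigframes.exceptions

    # Detection recurses into list and struct element types.
    nested = pa.list_(pa.struct([("payload", db_dtypes.JSONArrowType())]))
    assert bigframes.dtypes.contains_db_dtypes_json_arrow_type(nested)

    # Non-Arrow-backed dtypes never contain the JSON extension type.
    assert not bigframes.dtypes.contains_db_dtypes_json_dtype(pd.BooleanDtype())

    # Inspecting a JSON dtype emits the new JSONDtypeWarning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        bigframes.dtypes.warn_on_db_dtypes_json_dtype([bigframes.dtypes.JSON_DTYPE])
    assert issubclass(caught[0].category, bigframes.exceptions.JSONDtypeWarning)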
4 changes: 4 additions & 0 deletions bigframes/exceptions.py
@@ -111,6 +111,10 @@ class FunctionAxisOnePreviewWarning(PreviewWarning):
     """Remote Function and Managed UDF with axis=1 preview."""
 
 
+class JSONDtypeWarning(PreviewWarning):
+    """JSON dtype will be pd.ArrowDtype(pa.json_()) in the future."""
+
+
 class FunctionConflictTypeHintWarning(UserWarning):
     """Conflicting type hints in a BigFrames function."""
 
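Since the new warning subclasses PreviewWarning, existing filters that target PreviewWarning keep covering it unchanged:

    import bigframes.exceptions

    # Any filter on the PreviewWarning base class also matches JSONDtypeWarning.
    assert issubclass(
        bigframes.exceptions.JSONDtypeWarning,
        bigframes.exceptions.PreviewWarning,
    )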
2 changes: 2 additions & 0 deletions bigframes/series.py
@@ -113,10 +113,12 @@ def dt(self) -> dt.DatetimeMethods:
 
     @property
     def dtype(self):
+        bigframes.dtypes.warn_on_db_dtypes_json_dtype([self._dtype])
         return self._dtype
 
     @property
     def dtypes(self):
+        bigframes.dtypes.warn_on_db_dtypes_json_dtype([self._dtype])
         return self._dtype
 
     @property
@@ -20,7 +20,6 @@
 import pandas as pd
 import pyarrow as pa  # type: ignore
 import pytest
-import shapely.geometry  # type: ignore
 
 import bigframes.core.compile.ibis_types
 import bigframes.dtypes
@@ -225,22 +224,6 @@ def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str):
     assert result == ibis_dtype
 
 
-@pytest.mark.parametrize(
-    ["python_type", "expected_dtype"],
-    [
-        (bool, bigframes.dtypes.BOOL_DTYPE),
-        (int, bigframes.dtypes.INT_DTYPE),
-        (str, bigframes.dtypes.STRING_DTYPE),
-        (shapely.geometry.Point, bigframes.dtypes.GEO_DTYPE),
-        (shapely.geometry.Polygon, bigframes.dtypes.GEO_DTYPE),
-        (shapely.geometry.base.BaseGeometry, bigframes.dtypes.GEO_DTYPE),
-    ],
-)
-def test_bigframes_type_supports_python_types(python_type, expected_dtype):
-    got_dtype = bigframes.dtypes.bigframes_type(python_type)
-    assert got_dtype == expected_dtype
-
-
 def test_unsupported_dtype_raises_unexpected_datatype():
     """Incompatible dtypes should fail when passed into BigQuery DataFrames"""
     with pytest.raises(ValueError, match="Datatype has no ibis type mapping"):
@@ -265,19 +248,3 @@ def test_literal_to_ibis_scalar_converts(literal, ibis_scalar):
     assert bigframes.core.compile.ibis_types.literal_to_ibis_scalar(literal).equals(
         ibis_scalar
     )
-
-
-@pytest.mark.parametrize(
-    ["scalar", "expected_dtype"],
-    [
-        (pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
-        (pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
-        (pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE),
-        # Support NULL scalars.
-        (pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
-        (pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
-        (pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE),
-    ],
-)
-def test_infer_literal_type_arrow_scalar(scalar, expected_dtype):
-    assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype
73 changes: 73 additions & 0 deletions tests/unit/test_dtypes.py
@@ -0,0 +1,73 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import db_dtypes
+import pyarrow as pa  # type: ignore
+import pytest
+import shapely.geometry  # type: ignore
+
+import bigframes.dtypes
+
+
+@pytest.mark.parametrize(
+    ["python_type", "expected_dtype"],
+    [
+        (bool, bigframes.dtypes.BOOL_DTYPE),
+        (int, bigframes.dtypes.INT_DTYPE),
+        (str, bigframes.dtypes.STRING_DTYPE),
+        (shapely.geometry.Point, bigframes.dtypes.GEO_DTYPE),
+        (shapely.geometry.Polygon, bigframes.dtypes.GEO_DTYPE),
+        (shapely.geometry.base.BaseGeometry, bigframes.dtypes.GEO_DTYPE),
+    ],
+)
+def test_bigframes_type_supports_python_types(python_type, expected_dtype):
+    got_dtype = bigframes.dtypes.bigframes_type(python_type)
+    assert got_dtype == expected_dtype
+
+
+@pytest.mark.parametrize(
+    ["scalar", "expected_dtype"],
+    [
+        (pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
+        (pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
+        (pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE),
+        # Support NULL scalars.
+        (pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
+        (pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
+        (pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE),
+    ],
+)
+def test_infer_literal_type_arrow_scalar(scalar, expected_dtype):
+    assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype
+
+
+@pytest.mark.parametrize(
+    ["type_", "expected"],
+    [
+        (pa.int64(), False),
+        (db_dtypes.JSONArrowType(), True),
+        (pa.struct([("int", pa.int64()), ("str", pa.string())]), False),
+        (pa.struct([("int", pa.int64()), ("json", db_dtypes.JSONArrowType())]), True),
+        (pa.list_(pa.int64()), False),
+        (pa.list_(db_dtypes.JSONArrowType()), True),
+        (
+            pa.list_(
+                pa.struct([("int", pa.int64()), ("json", db_dtypes.JSONArrowType())])
+            ),
+            True,
+        ),
+    ],
+)
+def test_contains_db_dtypes_json_arrow_type(type_, expected):
+    assert bigframes.dtypes.contains_db_dtypes_json_arrow_type(type_) == expected