
Commit eca22ee
1 parent 67e46cd

fix: only show JSON dtype warning when accessing dtypes directly (#2136)

Authored by tswast and chelsea-lin
Co-authored-by: Chelsea Lin <[email protected]>

* fix: only show JSON dtype warning when accessing dtypes directly
* Update license year
* fix mypy
* fix unit tests
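In effect, loading a table that contains JSON columns no longer emits the preview warning; the warning now fires only when dtypes are inspected directly (DataFrame.dtypes, Series.dtype/dtypes, Index.dtype/dtypes). A minimal sketch of the new behavior, calling the helper this commit adds to bigframes.dtypes directly rather than going through a BigQuery session (the direct call is only for illustration):

import warnings

import bigframes.dtypes
import bigframes.exceptions

# JSON_DTYPE is backed by db_dtypes.JSONArrowType, so the helper warns.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    bigframes.dtypes.warn_on_db_dtypes_json_dtype([bigframes.dtypes.JSON_DTYPE])
assert any(issubclass(w.category, bigframes.exceptions.JSONDtypeWarning) for w in caught)

# Non-JSON dtypes pass through silently, so plain tables never warn on load.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    bigframes.dtypes.warn_on_db_dtypes_json_dtype([bigframes.dtypes.INT_DTYPE])
assert not caught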

File tree

9 files changed: +167 -48 lines


bigframes/core/array_value.py
Lines changed: 0 additions & 8 deletions

@@ -18,7 +18,6 @@
 import functools
 import typing
 from typing import Iterable, List, Mapping, Optional, Sequence, Tuple
-import warnings

 import google.cloud.bigquery
 import pandas
@@ -37,7 +36,6 @@
 import bigframes.core.tree_properties
 from bigframes.core.window_spec import WindowSpec
 import bigframes.dtypes
-import bigframes.exceptions as bfe
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops

@@ -101,12 +99,6 @@ def from_table(
     ):
         if offsets_col and primary_key:
             raise ValueError("must set at most one of 'offests', 'primary_key'")
-        if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
-            msg = bfe.format_message(
-                "JSON column interpretation as a custom PyArrow extention in `db_dtypes` "
-                "is a preview feature and subject to change."
-            )
-            warnings.warn(msg, bfe.PreviewWarning)
         # define data source only for needed columns, this makes row-hashing cheaper
         table_def = nodes.GbqTable.from_table(table, columns=schema.names)

bigframes/core/backports.py
Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helpers for working across versions of different depenencies."""
+
+from typing import List
+
+import pyarrow
+
+
+def pyarrow_struct_type_fields(struct_type: pyarrow.StructType) -> List[pyarrow.Field]:
+    """StructType.fields was added in pyarrow 18.
+
+    See: https://arrow.apache.org/docs/18.0/python/generated/pyarrow.StructType.html
+    """
+
+    if hasattr(struct_type, "fields"):
+        return struct_type.fields
+
+    return [
+        struct_type.field(field_index) for field_index in range(struct_type.num_fields)
+    ]
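A short usage sketch of the backport above, run against a plain struct type (the field names here are illustrative):

import pyarrow

import bigframes.core.backports

struct_type = pyarrow.struct([("name", pyarrow.string()), ("age", pyarrow.int64())])

# On pyarrow >= 18 this delegates to StructType.fields; on older versions it
# falls back to indexing each field via StructType.field(i).
fields = bigframes.core.backports.pyarrow_struct_type_fields(struct_type)
print([field.name for field in fields])  # ['name', 'age']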

bigframes/core/indexes/base.py
Lines changed: 6 additions & 2 deletions

@@ -171,12 +171,16 @@ def shape(self) -> typing.Tuple[int]:

     @property
     def dtype(self):
-        return self._block.index.dtypes[0] if self.nlevels == 1 else np.dtype("O")
+        dtype = self._block.index.dtypes[0] if self.nlevels == 1 else np.dtype("O")
+        bigframes.dtypes.warn_on_db_dtypes_json_dtype([dtype])
+        return dtype

     @property
     def dtypes(self) -> pandas.Series:
+        dtypes = self._block.index.dtypes
+        bigframes.dtypes.warn_on_db_dtypes_json_dtype(dtypes)
         return pandas.Series(
-            data=self._block.index.dtypes,
+            data=dtypes,
             index=typing.cast(typing.Tuple, self._block.index.names),
         )

bigframes/dataframe.py
Lines changed: 3 additions & 1 deletion

@@ -321,7 +321,9 @@ def at(self) -> indexers.AtDataFrameIndexer:

     @property
     def dtypes(self) -> pandas.Series:
-        return pandas.Series(data=self._block.dtypes, index=self._block.column_labels)
+        dtypes = self._block.dtypes
+        bigframes.dtypes.warn_on_db_dtypes_json_dtype(dtypes)
+        return pandas.Series(data=dtypes, index=self._block.column_labels)

     @property
     def columns(self) -> pandas.Index:

bigframes/dtypes.py
Lines changed: 46 additions & 4 deletions

@@ -20,6 +20,7 @@
 import textwrap
 import typing
 from typing import Any, Dict, List, Literal, Sequence, Union
+import warnings

 import bigframes_vendored.constants as constants
 import db_dtypes  # type: ignore
@@ -30,6 +31,9 @@
 import pyarrow as pa
 import shapely.geometry  # type: ignore

+import bigframes.core.backports
+import bigframes.exceptions
+
 # Type hints for Pandas dtypes supported by BigQuery DataFrame
 Dtype = Union[
     pd.BooleanDtype,
@@ -62,7 +66,8 @@
 # No arrow equivalent
 GEO_DTYPE = gpd.array.GeometryDtype()
 # JSON
-# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
+# TODO(https://github.com/pandas-dev/pandas/issues/60958): switch to
+# pyarrow.json_(pyarrow.string()) when pandas 3+ and pyarrow 18+ is installed.
 JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
 JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
 OBJ_REF_DTYPE = pd.ArrowDtype(
@@ -368,8 +373,7 @@ def get_struct_fields(type_: ExpressionType) -> dict[str, Dtype]:
     assert isinstance(type_.pyarrow_dtype, pa.StructType)
     struct_type = type_.pyarrow_dtype
     result: dict[str, Dtype] = {}
-    for field_no in range(struct_type.num_fields):
-        field = struct_type.field(field_no)
+    for field in bigframes.core.backports.pyarrow_struct_type_fields(struct_type):
         result[field.name] = arrow_dtype_to_bigframes_dtype(field.type)
     return result

@@ -547,7 +551,8 @@ def arrow_type_to_literal(
         return [arrow_type_to_literal(arrow_type.value_type)]
     if pa.types.is_struct(arrow_type):
         return {
-            field.name: arrow_type_to_literal(field.type) for field in arrow_type.fields
+            field.name: arrow_type_to_literal(field.type)
+            for field in bigframes.core.backports.pyarrow_struct_type_fields(arrow_type)
         }
     if pa.types.is_string(arrow_type):
         return "string"
@@ -915,3 +920,40 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype:


 TIMEDELTA_DESCRIPTION_TAG = "#microseconds"
+
+
+def contains_db_dtypes_json_arrow_type(type_):
+    if isinstance(type_, db_dtypes.JSONArrowType):
+        return True
+
+    if isinstance(type_, pa.ListType):
+        return contains_db_dtypes_json_arrow_type(type_.value_type)
+
+    if isinstance(type_, pa.StructType):
+        return any(
+            contains_db_dtypes_json_arrow_type(field.type)
+            for field in bigframes.core.backports.pyarrow_struct_type_fields(type_)
+        )
+    return False
+
+
+def contains_db_dtypes_json_dtype(dtype):
+    if not isinstance(dtype, pd.ArrowDtype):
+        return False
+
+    return contains_db_dtypes_json_arrow_type(dtype.pyarrow_dtype)
+
+
+def warn_on_db_dtypes_json_dtype(dtypes):
+    """Warn that the JSON dtype is changing.
+
+    Note: only call this function if the user is explicitly checking the
+    dtypes.
+    """
+    if any(contains_db_dtypes_json_dtype(dtype) for dtype in dtypes):
+        msg = bigframes.exceptions.format_message(
+            "JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_()) "
+            "instead of using `db_dtypes` in the future when available in pandas "
+            "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow."
+        )
+        warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)
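The detection is recursive, so a db_dtypes JSON leaf nested inside list or struct columns is also caught. A small sketch with illustrative types:

import db_dtypes
import pandas as pd
import pyarrow as pa

import bigframes.dtypes

# A struct column whose "payload" field is a list of db_dtypes JSON values.
nested = pd.ArrowDtype(pa.struct([("payload", pa.list_(db_dtypes.JSONArrowType()))]))

print(bigframes.dtypes.contains_db_dtypes_json_dtype(nested))  # True
print(bigframes.dtypes.contains_db_dtypes_json_dtype(pd.Int64Dtype()))  # False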

bigframes/exceptions.py
Lines changed: 4 additions & 0 deletions

@@ -111,6 +111,10 @@ class FunctionAxisOnePreviewWarning(PreviewWarning):
     """Remote Function and Managed UDF with axis=1 preview."""


+class JSONDtypeWarning(PreviewWarning):
+    """JSON dtype will be pd.ArrowDtype(pa.json_()) in the future."""
+
+
 class FunctionConflictTypeHintWarning(UserWarning):
     """Conflicting type hints in a BigFrames function."""
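Since JSONDtypeWarning subclasses PreviewWarning, callers who have already acknowledged the upcoming dtype change can opt out of just this notice with a standard warnings filter; a minimal sketch, assuming no other warning filters are in play:

import warnings

import bigframes.exceptions

# Silence only the JSON dtype preview notice; other preview warnings still surface.
warnings.simplefilter("ignore", category=bigframes.exceptions.JSONDtypeWarning)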

bigframes/series.py
Lines changed: 2 additions & 0 deletions

@@ -113,10 +113,12 @@ def dt(self) -> dt.DatetimeMethods:

     @property
     def dtype(self):
+        bigframes.dtypes.warn_on_db_dtypes_json_dtype([self._dtype])
         return self._dtype

     @property
     def dtypes(self):
+        bigframes.dtypes.warn_on_db_dtypes_json_dtype([self._dtype])
         return self._dtype

     @property

tests/unit/core/test_dtypes.py renamed to tests/unit/core/test_ibis_types.py
Lines changed: 0 additions & 33 deletions

@@ -20,7 +20,6 @@
 import pandas as pd
 import pyarrow as pa  # type: ignore
 import pytest
-import shapely.geometry  # type: ignore

 import bigframes.core.compile.ibis_types
 import bigframes.dtypes
@@ -225,22 +224,6 @@ def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str):
     assert result == ibis_dtype


[email protected](
-    ["python_type", "expected_dtype"],
-    [
-        (bool, bigframes.dtypes.BOOL_DTYPE),
-        (int, bigframes.dtypes.INT_DTYPE),
-        (str, bigframes.dtypes.STRING_DTYPE),
-        (shapely.geometry.Point, bigframes.dtypes.GEO_DTYPE),
-        (shapely.geometry.Polygon, bigframes.dtypes.GEO_DTYPE),
-        (shapely.geometry.base.BaseGeometry, bigframes.dtypes.GEO_DTYPE),
-    ],
-)
-def test_bigframes_type_supports_python_types(python_type, expected_dtype):
-    got_dtype = bigframes.dtypes.bigframes_type(python_type)
-    assert got_dtype == expected_dtype
-
-
 def test_unsupported_dtype_raises_unexpected_datatype():
     """Incompatible dtypes should fail when passed into BigQuery DataFrames"""
     with pytest.raises(ValueError, match="Datatype has no ibis type mapping"):
@@ -265,19 +248,3 @@ def test_literal_to_ibis_scalar_converts(literal, ibis_scalar):
     assert bigframes.core.compile.ibis_types.literal_to_ibis_scalar(literal).equals(
         ibis_scalar
     )
-
-
[email protected](
-    ["scalar", "expected_dtype"],
-    [
-        (pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
-        (pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
-        (pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE),
-        # Support NULL scalars.
-        (pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
-        (pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
-        (pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE),
-    ],
-)
-def test_infer_literal_type_arrow_scalar(scalar, expected_dtype):
-    assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype

tests/unit/test_dtypes.py
Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import db_dtypes  # type: ignore
+import pyarrow as pa  # type: ignore
+import pytest
+import shapely.geometry  # type: ignore
+
+import bigframes.dtypes
+
+
[email protected](
+    ["python_type", "expected_dtype"],
+    [
+        (bool, bigframes.dtypes.BOOL_DTYPE),
+        (int, bigframes.dtypes.INT_DTYPE),
+        (str, bigframes.dtypes.STRING_DTYPE),
+        (shapely.geometry.Point, bigframes.dtypes.GEO_DTYPE),
+        (shapely.geometry.Polygon, bigframes.dtypes.GEO_DTYPE),
+        (shapely.geometry.base.BaseGeometry, bigframes.dtypes.GEO_DTYPE),
+    ],
+)
+def test_bigframes_type_supports_python_types(python_type, expected_dtype):
+    got_dtype = bigframes.dtypes.bigframes_type(python_type)
+    assert got_dtype == expected_dtype
+
+
[email protected](
+    ["scalar", "expected_dtype"],
+    [
+        (pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
+        (pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
+        (pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE),
+        # Support NULL scalars.
+        (pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
+        (pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
+        (pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE),
+    ],
+)
+def test_infer_literal_type_arrow_scalar(scalar, expected_dtype):
+    assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype
+
+
[email protected](
+    ["type_", "expected"],
+    [
+        (pa.int64(), False),
+        (db_dtypes.JSONArrowType(), True),
+        (pa.struct([("int", pa.int64()), ("str", pa.string())]), False),
+        (pa.struct([("int", pa.int64()), ("json", db_dtypes.JSONArrowType())]), True),
+        (pa.list_(pa.int64()), False),
+        (pa.list_(db_dtypes.JSONArrowType()), True),
+        (
+            pa.list_(
+                pa.struct([("int", pa.int64()), ("json", db_dtypes.JSONArrowType())])
+            ),
+            True,
+        ),
+    ],
+)
+def test_contains_db_dtypes_json_arrow_type(type_, expected):
+    assert bigframes.dtypes.contains_db_dtypes_json_arrow_type(type_) == expected
