diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 81ff447..0c2dca0 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -76,7 +76,7 @@ jobs:
         python -m pip install nox
     - name: Run compliance tests
       env:
-        COVERAGE_FILE: .coverage-${{ matrix.python }}
+        COVERAGE_FILE: .coverage-compliance-${{ matrix.python }}
       run: |
         nox -s compliance-${{ matrix.python }}
     - name: Upload coverage results
diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py
index ad4ea33..d27e93e 100644
--- a/db_dtypes/__init__.py
+++ b/db_dtypes/__init__.py
@@ -43,7 +43,14 @@
 # nanosecond precision when boxing scalars.
 _NP_BOX_DTYPE = "datetime64[us]"
 
-pandas_release = packaging.version.parse(pandas.__version__).release
+
+# To use JSONArray and JSONDtype, you'll need pandas 1.5.0 or later. With the removal
+# of Python 3.7 compatibility, the minimum pandas version will be updated to 1.5.0.
+if packaging.version.Version(pandas.__version__) >= packaging.version.Version("1.5.0"):
+    from db_dtypes.json import JSONArray, JSONDtype
+else:
+    JSONArray = None
+    JSONDtype = None
 
 
 @pandas.api.extensions.register_extension_dtype
@@ -337,10 +344,21 @@ def __sub__(self, other):
         return super().__sub__(other)
 
 
-__all__ = [
-    "__version__",
-    "DateArray",
-    "DateDtype",
-    "TimeArray",
-    "TimeDtype",
-]
+if not JSONArray or not JSONDtype:
+    __all__ = [
+        "__version__",
+        "DateArray",
+        "DateDtype",
+        "TimeArray",
+        "TimeDtype",
+    ]
+else:
+    __all__ = [
+        "__version__",
+        "DateArray",
+        "DateDtype",
+        "JSONDtype",
+        "JSONArray",
+        "TimeArray",
+        "TimeDtype",
+    ]
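Note on the version gate above: on pandas older than 1.5.0, `db_dtypes.JSONArray` and `db_dtypes.JSONDtype` remain importable but are `None`. A minimal sketch of how downstream code might guard on that (illustrative only, not part of this patch):

    import pandas as pd

    import db_dtypes

    if db_dtypes.JSONDtype is not None:
        # pandas >= 1.5.0: the "dbjson" extension dtype is available.
        ser = pd.Series([{"a": 1}, None], dtype=db_dtypes.JSONDtype())
    else:
        # Older pandas: fall back to plain object dtype.
        ser = pd.Series([{"a": 1}, None], dtype="object")
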
+ """ + return str + + @property + def pyarrow_dtype(self): + """Return the pyarrow data type used for storing data in the pyarrow array.""" + return pa.string() + + @property + def _is_numeric(self) -> bool: + return False + + @property + def _is_boolean(self) -> bool: + return False + + @classmethod + def construct_array_type(cls): + """Return the array type associated with this dtype.""" + return JSONArray + + +class JSONArray(arrays.ArrowExtensionArray): + """Extension array that handles BigQuery JSON data, leveraging a string-based + pyarrow array for storage. It enables seamless conversion to JSON objects when + accessing individual elements.""" + + _dtype = JSONDtype() + + def __init__(self, values, dtype=None, copy=False) -> None: + self._dtype = JSONDtype() + if isinstance(values, pa.Array): + self._pa_array = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self._pa_array = values + else: + raise ValueError(f"Unsupported type '{type(values)}' for JSONArray") + + @classmethod + def _box_pa( + cls, value, pa_type: pa.DataType | None = None + ) -> pa.Array | pa.ChunkedArray | pa.Scalar: + """Box value into a pyarrow Array, ChunkedArray or Scalar.""" + assert pa_type is None or pa_type == cls._dtype.pyarrow_dtype + + if isinstance(value, pa.Scalar) or not ( + common.is_list_like(value) and not common.is_dict_like(value) + ): + return cls._box_pa_scalar(value) + return cls._box_pa_array(value) + + @classmethod + def _box_pa_scalar(cls, value) -> pa.Scalar: + """Box value into a pyarrow Scalar.""" + if pd.isna(value): + pa_scalar = pa.scalar(None, type=cls._dtype.pyarrow_dtype) + else: + value = JSONArray._serialize_json(value) + pa_scalar = pa.scalar( + value, type=cls._dtype.pyarrow_dtype, from_pandas=True + ) + + return pa_scalar + + @classmethod + def _box_pa_array(cls, value, copy: bool = False) -> pa.Array | pa.ChunkedArray: + """Box value into a pyarrow Array or ChunkedArray.""" + if isinstance(value, cls): + pa_array = value._pa_array + else: + value = [JSONArray._serialize_json(x) for x in value] + pa_array = pa.array(value, type=cls._dtype.pyarrow_dtype, from_pandas=True) + return pa_array + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy=False): + """Construct a new ExtensionArray from a sequence of scalars.""" + pa_array = cls._box_pa(scalars) + arr = cls(pa_array) + return arr + + @staticmethod + def _serialize_json(value): + """A static method that converts a JSON value into a string representation.""" + if not common.is_list_like(value) and pd.isna(value): + return value + else: + # `sort_keys=True` sorts dictionary keys before serialization, making + # JSON comparisons deterministic. + return json.dumps(value, sort_keys=True) + + @staticmethod + def _deserialize_json(value): + """A static method that converts a JSON string back into its original value.""" + if not pd.isna(value): + return json.loads(value) + else: + return value + + @property + def dtype(self) -> JSONDtype: + """An instance of JSONDtype""" + return self._dtype + + def _cmp_method(self, other, op): + if op.__name__ == "eq": + result = pyarrow.compute.equal(self._pa_array, self._box_pa(other)) + elif op.__name__ == "ne": + result = pyarrow.compute.not_equal(self._pa_array, self._box_pa(other)) + else: + # Comparison is not a meaningful one. We don't want to support sorting by JSON columns. 
+ raise TypeError(f"{op.__name__} not supported for JSONArray") + return arrays.ArrowExtensionArray(result) + + def __getitem__(self, item): + """Select a subset of self.""" + item = indexers.check_array_indexer(self, item) + + if isinstance(item, np.ndarray): + if not len(item): + return type(self)(pa.chunked_array([], type=self.dtype.pyarrow_dtype)) + elif item.dtype.kind in "iu": + return self.take(item) + else: + # `check_array_indexer` should verify that the assertion hold true. + assert item.dtype.kind == "b" + return type(self)(self._pa_array.filter(item)) + elif isinstance(item, tuple): + item = indexers.unpack_tuple_and_ellipses(item) + + if common.is_scalar(item) and not common.is_integer(item): + # e.g. "foo" or 2.5 + # exception message copied from numpy + raise IndexError( + r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " + r"(`None`) and integer or boolean arrays are valid indices" + ) + + value = self._pa_array[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + scalar = JSONArray._deserialize_json(value.as_py()) + if scalar is None: + return self._dtype.na_value + else: + return scalar + + def __iter__(self): + """Iterate over elements of the array.""" + for value in self._pa_array: + val = JSONArray._deserialize_json(value.as_py()) + if val is None: + yield self._dtype.na_value + else: + yield val + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + """Return a scalar result of performing the reduction operation.""" + if name in ["min", "max"]: + raise TypeError("JSONArray does not support min/max reducntion.") + super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py new file mode 100644 index 0000000..74870c4 --- /dev/null +++ b/tests/compliance/json/conftest.py @@ -0,0 +1,181 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import random + +import numpy as np +import pandas as pd +import pytest + +from db_dtypes import JSONArray, JSONDtype + + +def make_data(): + # Since the `np.array` constructor needs a consistent shape after the first + # dimension, the samples data in this instance doesn't include the array type. + samples = [ + True, # Boolean + 100, # Int + 0.98, # Float + "str", # String + {"bool_value": True}, # Dict with a boolean + {"float_num": 3.14159}, # Dict with a float + {"date": "2024-07-16"}, # Dict with a date (as strings) + {"null_field": None}, # Dict with a null + {"list_data": [10, 20, 30]}, # Dict with a list + {"person": {"name": "Alice", "age": 35}}, # Dict with nested objects + {"address": {"street": "123 Main St", "city": "Anytown"}}, + {"order": {"items": ["book", "pen"], "total": 15.99}}, + ] + data = np.random.default_rng(2).choice(samples, size=100) + # This replaces a single data item with an array. 
diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py
new file mode 100644
index 0000000..74870c4
--- /dev/null
+++ b/tests/compliance/json/conftest.py
@@ -0,0 +1,181 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import random
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from db_dtypes import JSONArray, JSONDtype
+
+
+def make_data():
+    # Since the `np.array` constructor needs a consistent shape after the first
+    # dimension, the sample data in this instance doesn't include the array type.
+    samples = [
+        True,  # Boolean
+        100,  # Int
+        0.98,  # Float
+        "str",  # String
+        {"bool_value": True},  # Dict with a boolean
+        {"float_num": 3.14159},  # Dict with a float
+        {"date": "2024-07-16"},  # Dict with a date (as a string)
+        {"null_field": None},  # Dict with a null
+        {"list_data": [10, 20, 30]},  # Dict with a list
+        {"person": {"name": "Alice", "age": 35}},  # Dict with nested objects
+        {"address": {"street": "123 Main St", "city": "Anytown"}},
+        {"order": {"items": ["book", "pen"], "total": 15.99}},
+    ]
+    data = np.random.default_rng(2).choice(samples, size=100)
+    # This replaces a single data item with an array. We skip the first three
+    # items to avoid failures in some `setitem` tests, because setting an
+    # element with a list is ambiguous in this context.
+    id = random.randint(3, 99)
+    data[id] = [0.1, 0.2]  # Array
+    return data
+
+
+@pytest.fixture
+def dtype():
+    return JSONDtype()
+
+
+@pytest.fixture
+def data():
+    """Length-100 JSONArray for semantics tests."""
+    data = make_data()
+
+    return JSONArray._from_sequence(data)
+
+
+@pytest.fixture
+def data_for_twos(dtype):
+    """
+    Length-100 array in which all the elements are two.
+
+    Call pytest.skip in your fixture if the dtype does not support divmod.
+    """
+    pytest.skip(f"{dtype} is not a numeric dtype")
+
+
+@pytest.fixture
+def data_missing():
+    """Length-2 array with [NA, Valid]."""
+    return JSONArray._from_sequence([None, {"a": 10}])
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    return JSONArray._from_sequence([json.dumps({"b": 1}), None, json.dumps({"a": 4})])
+
+
+@pytest.fixture
+def na_cmp():
+    """
+    Binary operator for comparing NA values.
+
+    Should return a function of two arguments that returns
+    True if both arguments are (scalar) NA for your type.
+
+    By default, uses ``operator.is_``.
+    """
+
+    def cmp(left, right):
+        return pd.isna(left) and pd.isna(right)
+
+    return cmp
+
+
+@pytest.fixture
+def data_repeated(data):
+    """
+    Generate many datasets.
+
+    Parameters
+    ----------
+    data : fixture implementing `data`
+
+    Returns
+    -------
+    Callable[[int], Generator]:
+        A callable that takes a `count` argument and
+        returns a generator yielding `count` datasets.
+    """
+
+    def gen(count):
+        for _ in range(count):
+            yield data
+
+    return gen
+
+
+_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"]
+
+
+@pytest.fixture(params=_all_numeric_accumulations)
+def all_numeric_accumulations(request):
+    """
+    Fixture for numeric accumulation names.
+    """
+    return request.param
+
+
+_all_boolean_reductions = ["all", "any"]
+
+
+@pytest.fixture(params=_all_boolean_reductions)
+def all_boolean_reductions(request):
+    """
+    Fixture for boolean reduction names.
+    """
+    return request.param
+
+
+_all_numeric_reductions = [
+    "count",
+    "sum",
+    "max",
+    "min",
+    "mean",
+    "prod",
+    "std",
+    "var",
+    "median",
+    "kurt",
+    "skew",
+    "sem",
+]
+
+
+@pytest.fixture(params=_all_numeric_reductions)
+def all_numeric_reductions(request):
+    """
+    Fixture for numeric reduction names.
+    """
+    return request.param
+
+
+@pytest.fixture(params=["data", "data_missing"])
+def all_data(request, data, data_missing):
+    """Parametrized fixture returning 'data' or 'data_missing' JSON arrays.
+
+    Used to test dtype conversion with and without missing values.
+    """
+    if request.param == "data":
+        return data
+    elif request.param == "data_missing":
+        return data_missing
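Note: as the comments in `make_data` explain, the sample pool excludes list values so that `np.random.default_rng().choice` can build a consistent object array, and exactly one list element is patched in afterwards at a random position of three or higher. A quick sanity check of that invariant (illustrative only; assumes it runs inside this conftest module):

    data = make_data()
    assert len(data) == 100
    # Exactly one element was replaced with the list [0.1, 0.2].
    assert sum(isinstance(x, list) for x in data) == 1
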
diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py
new file mode 100644
index 0000000..18610a0
--- /dev/null
+++ b/tests/compliance/json/test_json_compliance.py
@@ -0,0 +1,361 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import typing
+
+import numpy as np
+import pandas as pd
+import pandas._testing as tm
+import pandas.tests.extension.base as base
+import pytest
+
+
+class TestJSONArrayAccumulate(base.BaseAccumulateTests):
+    pass
+
+
+class TestJSONArrayCasting(base.BaseCastingTests):
+    def test_astype_str(self, data):
+        # Build the expected values with `json.dumps(x, sort_keys=True)` rather
+        # than `str(obj)` as the super method does.
+        result = pd.Series(data[:5]).astype(str)
+        expected = pd.Series(
+            [json.dumps(x, sort_keys=True) for x in data[:5]], dtype=str
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "nullable_string_dtype",
+        [
+            "string[python]",
+            "string[pyarrow]",
+        ],
+    )
+    def test_astype_string(self, data, nullable_string_dtype):
+        # Build the expected values with `json.dumps(x, sort_keys=True)` rather
+        # than `str(obj)` as the super method does.
+        result = pd.Series(data[:5]).astype(nullable_string_dtype)
+        expected = pd.Series(
+            [json.dumps(x, sort_keys=True) for x in data[:5]],
+            dtype=nullable_string_dtype,
+        )
+        tm.assert_series_equal(result, expected)
+
+
+class TestJSONArrayConstructors(base.BaseConstructorsTests):
+    def test_from_dtype(self, data):
+        # construct from our dtype & string dtype
+        dtype = data.dtype
+
+        expected = pd.Series(data)
+        result = pd.Series(list(data), dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+        result = pd.Series(list(data), dtype=str(dtype))
+        tm.assert_series_equal(result, expected)
+
+        # Use `{"col1": data}` instead of passing `data` directly to the super method.
+        # This prevents the DataFrame constructor from interpreting the dict elements
+        # as column headers.
+
+        # gh-30280
+        expected = pd.DataFrame({"col1": data}).astype(dtype)
+        result = pd.DataFrame({"col1": list(data)}, dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+        result = pd.DataFrame({"col1": list(data)}, dtype=str(dtype))
+        tm.assert_frame_equal(result, expected)
+
+    def test_series_constructor_scalar_with_index(self, data, dtype):
+        # Use json.dumps(data[0]) instead of passing data[0] directly to the super
+        # method. This prevents the Series constructor from interpreting the dict
+        # as an index.
+        scalar = json.dumps(data[0])
+        result = pd.Series(scalar, index=[1, 2, 3], dtype=dtype)
+        expected = pd.Series([scalar] * 3, index=[1, 2, 3], dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+        result = pd.Series(scalar, index=["foo"], dtype=dtype)
+        expected = pd.Series([scalar], index=["foo"], dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.")
+class TestJSONArrayGroupby(base.BaseGroupbyTests):
+    pass
+
+
+class TestJSONArrayDtype(base.BaseDtypeTests):
+    pass
+
+
+class TestJSONArrayGetitem(base.BaseGetitemTests):
+    @pytest.mark.xfail(reason="JSONDtype's type returns its storage type.")
+    def test_getitem_scalar(self, data):
+        """
+        `__getitem__` can return any JSON-typed object, while `data.dtype.type`
+        returns a string to indicate the storage type.
+        > assert isinstance(result, data.dtype.type)
+        E AssertionError
+        """
+        super().test_getitem_scalar(data)
+
+
+class TestJSONArrayIndex(base.BaseIndexTests):
+    pass
+
+
+class TestJSONArrayInterface(base.BaseInterfaceTests):
+    def test_array_interface(self, data):
+        result = np.array(data)
+        # Compare against `json.dumps(data[0])` rather than `data[0]`, because
+        # `np.array(data)` serializes each element to a JSON string.
+        assert result[0] == json.dumps(data[0])
+
+        result = np.array(data, dtype=object)
+        # The expected values are serialized JSON strings as well.
+        expected = np.array([json.dumps(x) for x in data], dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.skip(reason="2D support not implemented for JSONArray")
+    def test_view(self, data):
+        super().test_view(data)
+
+
+class TestJSONArrayParsing(base.BaseParsingTests):
+    @pytest.mark.xfail(reason="data type 'json' not understood")
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    def test_EA_types(self, engine, data, request):
+        super().test_EA_types(engine, data, request)
+
+
+class TestJSONArrayMethods(base.BaseMethodsTests):
+    @pytest.mark.xfail(reason="Unhashable")
+    def test_value_counts_with_normalize(self, data):
+        super().test_value_counts_with_normalize(data)
+
+    @pytest.mark.skip("fill-value is interpreted as a dict of values")
+    def test_fillna_copy_frame(self, data_missing):
+        super().test_fillna_copy_frame(data_missing)
+
+    @pytest.mark.xfail(reason="combine for JSONArray not supported")
+    def test_combine_le(self, data_repeated):
+        super().test_combine_le(data_repeated)
+
+    @pytest.mark.skip(reason="'<' not supported between instances of 'dict' and 'dict'")
+    def test_searchsorted(self, data_for_sorting, as_series):
+        super().test_searchsorted(data_for_sorting, as_series)
+
+    @pytest.mark.xfail(
+        reason="`to_numpy` returns serialized JSON, "
+        + "while `__getitem__` returns JSON objects."
+    )
+    def test_where_series(self, data, na_value, as_frame):
+        # `Series.where` calls `to_numpy` to get results.
+        super().test_where_series(data, na_value, as_frame)
+
+    @pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.")
+    def test_factorize(self, data_for_grouping):
+        super().test_factorize(data_for_grouping)
+
+    @pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.")
+    def test_factorize_equivalence(self, data_for_grouping):
+        super().test_factorize_equivalence(data_for_grouping)
+
+    @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.")
+    def test_argsort(self, data_for_sorting):
+        super().test_argsort(data_for_sorting)
+
+    @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.")
+    def test_argmin_argmax(self, data_for_sorting):
+        super().test_argmin_argmax(data_for_sorting)
+
+    @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.")
+    def test_sort_values(self, data_for_sorting):
+        super().test_sort_values(data_for_sorting)
+
+    @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.")
+    def test_sort_values_frame(self, data_for_sorting):
+        super().test_sort_values_frame(data_for_sorting)
+
+
+class TestJSONArrayMissing(base.BaseMissingTests):
+    @pytest.mark.xfail(reason="Setting a dict as a scalar")
+    def test_fillna_series(self):
+        """We treat dictionaries as a mapping in fillna, not a scalar."""
+        super().test_fillna_series()
+
+    @pytest.mark.xfail(reason="Setting a dict as a scalar")
+    def test_fillna_frame(self):
+        """We treat dictionaries as a mapping in fillna, not a scalar."""
+        super().test_fillna_frame()
+
+
+@pytest.mark.skip(reason="BigQuery JSON does not allow Arithmetic Ops.")
+class TestJSONArrayArithmeticOps(base.BaseArithmeticOpsTests):
+    pass
+
+
+class TestJSONArrayComparisonOps(base.BaseComparisonOpsTests):
+    def test_compare_array(self, data, comparison_op, request):
+        if comparison_op.__name__ not in ["eq", "ne"]:
+            mark = pytest.mark.xfail(reason="Comparison methods not implemented")
+            request.applymarker(mark)
+        super().test_compare_array(data, comparison_op)
+
+    def test_compare_scalar(self, data, comparison_op, request):
+        if comparison_op.__name__ not in ["eq", "ne"]:
+            mark = pytest.mark.xfail(reason="Comparison methods not implemented")
+            request.applymarker(mark)
+        super().test_compare_scalar(data, comparison_op)
+
+    def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
+        dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj))
+        if op_name in ["__add__", "__radd__"]:
+            cast_to = dtype
+        else:
+            cast_to = "boolean[pyarrow]"  # type: ignore[assignment]
+        return pointwise_result.astype(cast_to)
+
+
+class TestJSONArrayUnaryOps(base.BaseUnaryOpsTests):
+    pass
+
+
+class TestJSONArrayPrinting(base.BasePrintingTests):
+    pass
+
+
+class TestJSONArrayReduce(base.BaseReduceTests):
+    pass
+
+
+class TestJSONArrayReshaping(base.BaseReshapingTests):
+    @pytest.mark.skip(reason="2D support not implemented for JSONArray")
+    def test_transpose(self, data):
+        super().test_transpose(data)
+
+    @pytest.mark.xfail(
+        reason="`to_numpy` returns serialized JSON, "
+        + "while `__getitem__` returns JSON objects."
+    )
+    def test_transpose_frame(self, data):
+        # `DataFrame.T` calls `to_numpy` to get results.
+        super().test_transpose_frame(data)
+
+
+class TestJSONArraySetitem(base.BaseSetitemTests):
+    # These tests override base.BaseSetitemTests to assign `[value] * len(...)`
+    # instead of a bare scalar, because pandas' internals have trouble setting
+    # sequences of values into scalar positions.
+
+    @pytest.mark.parametrize(
+        "idx",
+        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
+        ids=["list", "integer-array", "numpy-array"],
+    )
+    def test_setitem_integer_array(self, data, idx, box_in_series):
+        arr = data[:5].copy()
+        expected = data.take([0, 0, 0, 3, 4])
+
+        if box_in_series:
+            arr = pd.Series(arr)
+            expected = pd.Series(expected)
+
+        # Use `[arr[0]] * len()` instead of passing `arr[0]` directly to the super method.
+        arr[idx] = [arr[0]] * len(arr[idx])
+        tm.assert_equal(arr, expected)
+
+    @pytest.mark.parametrize(
+        "mask",
+        [
+            np.array([True, True, True, False, False]),
+            pd.array([True, True, True, False, False], dtype="boolean"),
+            pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
+        ],
+        ids=["numpy-array", "boolean-array", "boolean-array-na"],
+    )
+    def test_setitem_mask(self, data, mask, box_in_series):
+        arr = data[:5].copy()
+        expected = arr.take([0, 0, 0, 3, 4])
+        if box_in_series:
+            arr = pd.Series(arr)
+            expected = pd.Series(expected)
+        # Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method.
+        arr[mask] = [data[0]] * len(arr[mask])
+        tm.assert_equal(expected, arr)
+
+    def test_setitem_loc_iloc_slice(self, data):
+        arr = data[:5].copy()
+        s = pd.Series(arr, index=["a", "b", "c", "d", "e"])
+        expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index)
+
+        result = s.copy()
+        # Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method.
+        result.iloc[:3] = [data[0]] * len(result.iloc[:3])
+        tm.assert_equal(result, expected)
+
+        result = s.copy()
+        result.loc[:"c"] = [data[0]] * len(result.loc[:"c"])
+        tm.assert_equal(result, expected)
+
+    def test_setitem_slice(self, data, box_in_series):
+        arr = data[:5].copy()
+        expected = data.take([0, 0, 0, 3, 4])
+        if box_in_series:
+            arr = pd.Series(arr)
+            expected = pd.Series(expected)
+
+        # Use `[data[0]] * 3` instead of passing `data[0]` directly to the super method.
+        arr[:3] = [data[0]] * 3
+        tm.assert_equal(arr, expected)
+
+    @pytest.mark.xfail(reason="only integer scalar arrays can be converted")
+    def test_setitem_2d_values(self, data):
+        super().test_setitem_2d_values(data)
+
+    @pytest.mark.xfail(
+        reason="`to_numpy` returns serialized JSON, "
+        + "while `__getitem__` returns JSON objects."
+    )
+    def test_setitem_frame_2d_values(self, data):
+        super().test_setitem_frame_2d_values(data)
+
+    @pytest.mark.parametrize("setter", ["loc", None])
+    def test_setitem_mask_broadcast(self, data, setter):
+        ser = pd.Series(data)
+        mask = np.zeros(len(data), dtype=bool)
+        mask[:2] = True
+
+        if setter:  # loc
+            target = getattr(ser, setter)
+        else:  # __setitem__
+            target = ser
+
+        # Use `[data[10]] * len()` instead of passing `data[10]` directly to the super method.
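Note: the `[value] * len(...)` pattern used throughout `TestJSONArraySetitem` exists because a bare dict (or list) on the right-hand side of `__setitem__` is ambiguous: pandas may treat it as a sequence of values rather than one scalar. An illustrative sketch of the workaround (assumed behavior, not part of this patch):

    import pandas as pd

    import db_dtypes

    ser = pd.Series(
        db_dtypes.JSONArray._from_sequence([{"a": 1}, {"b": 2}, {"c": 3}])
    )
    # Ambiguous: a bare dict could be read as a mapping or as several values.
    # ser[:2] = {"z": 10}
    # Unambiguous: exactly one element per target position.
    ser[:2] = [{"z": 10}] * 2
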
+        target[mask] = [data[10]] * len(target[mask])
+        assert ser[0] == data[10]
+        assert ser[1] == data[10]
+
+    @pytest.mark.xfail(reason="eq not implemented for ")
+    def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
+        super().test_setitem_mask_boolean_array_with_na(data, box_in_series)
+
+    @pytest.mark.skip(reason="2D support not implemented for JSONArray")
+    def test_setitem_preserves_views(self, data):
+        super().test_setitem_preserves_views(data)
+
+
+class TestJSONArrayDim2Compat(base.Dim2CompatTests):
+    pass
diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py
new file mode 100644
index 0000000..c48635d
--- /dev/null
+++ b/tests/unit/test_json.py
@@ -0,0 +1,95 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+
+import pandas as pd
+import pytest
+
+import db_dtypes
+
+# Check for the minimum pandas version.
+pytest.importorskip("pandas", minversion="1.5.0")
+
+
+# Python data types mirroring all standard JSON types:
+# https://json-schema.org/understanding-json-schema/reference/type
+JSON_DATA = {
+    "boolean": True,
+    "int": 100,
+    "float": 0.98,
+    "string": "hello world",
+    "array": [0.1, 0.2],
+    "dict": {
+        "null_field": None,
+        "order": {
+            "items": ["book", "pen", "computer"],
+            "total": 15.99,
+            "address": {"street": "123 Main St", "city": "Anytown"},
+        },
+    },
+    "null": None,
+}
+
+
+def test_construct_w_unsupported_types():
+    with pytest.raises(ValueError):
+        db_dtypes.JSONArray(100)
+
+
+def test_getitems_return_json_objects():
+    data = db_dtypes.JSONArray._from_sequence(JSON_DATA.values())
+    for id, key in enumerate(JSON_DATA.keys()):
+        if key == "null":
+            assert pd.isna(data[id])
+        else:
+            assert data[id] == JSON_DATA[key]
+
+
+def test_getitems_w_unboxed_dict():
+    data = db_dtypes.JSONArray._from_sequence([JSON_DATA["dict"]])
+    assert len(data[0]) == 2
+
+    assert data[0]["null_field"] is None
+    assert data[0]["order"]["address"]["city"] == "Anytown"
+    assert len(data[0]["order"]["items"]) == 3
+    assert data[0]["order"]["items"][0] == "book"
+
+    with pytest.raises(KeyError):
+        data[0]["unknown"]
+
+
+def test_getitems_when_iter_with_null():
+    data = db_dtypes.JSONArray._from_sequence([JSON_DATA["null"]])
+    s = pd.Series(data)
+    result = s[:1].item()
+    assert pd.isna(result)
+
+
+def test_to_numpy():
+    s = pd.Series(db_dtypes.JSONArray._from_sequence(JSON_DATA.values()))
+    data = s.to_numpy()
+    for id, key in enumerate(JSON_DATA.keys()):
+        if key == "null":
+            assert pd.isna(data[id])
+        else:
+            assert data[id] == json.dumps(JSON_DATA[key], sort_keys=True)
+
+
+def test_deterministic_json_serialization():
+    x = {"a": 0, "b": 1}
+    y = {"b": 1, "a": 0}
+    data = db_dtypes.JSONArray._from_sequence([x])
+    assert y in data
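Note: because `_serialize_json` serializes with `json.dumps(..., sort_keys=True)`, two dicts that differ only in key order map to the same storage string, which is what `test_deterministic_json_serialization` exercises. A one-line check of the underlying property (illustrative only):

    import json

    assert json.dumps({"a": 0, "b": 1}, sort_keys=True) == json.dumps(
        {"b": 1, "a": 0}, sort_keys=True
    )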