diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index e2dfa38ce1..e0a22b6952 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1148,6 +1148,35 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): elif to_type == ibis_dtypes.time: return x_converted.time() + if to_type == ibis_dtypes.json: + if x.type() == ibis_dtypes.string: + return parse_json_in_safe(x) if op.safe else parse_json(x) + if x.type() == ibis_dtypes.bool: + x_bool = typing.cast( + ibis_types.StringValue, + bigframes.core.compile.ibis_types.cast_ibis_value( + x, ibis_dtypes.string, safe=op.safe + ), + ).lower() + return parse_json_in_safe(x_bool) if op.safe else parse_json(x_bool) + if x.type() in (ibis_dtypes.int64, ibis_dtypes.float64): + x_str = bigframes.core.compile.ibis_types.cast_ibis_value( + x, ibis_dtypes.string, safe=op.safe + ) + return parse_json_in_safe(x_str) if op.safe else parse_json(x_str) + + if x.type() == ibis_dtypes.json: + if to_type == ibis_dtypes.int64: + return cast_json_to_int64_in_safe(x) if op.safe else cast_json_to_int64(x) + if to_type == ibis_dtypes.float64: + return ( + cast_json_to_float64_in_safe(x) if op.safe else cast_json_to_float64(x) + ) + if to_type == ibis_dtypes.bool: + return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x) + if to_type == ibis_dtypes.string: + return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x) + # TODO: either inline this function, or push rest of this op into the function return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe) @@ -2031,6 +2060,11 @@ def parse_json(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body] """Converts a JSON-formatted STRING value to a JSON value.""" +@ibis_udf.scalar.builtin(name="SAFE.PARSE_JSON") +def parse_json_in_safe(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body] + """Converts a JSON-formatted STRING value to a JSON value in the safe mode.""" + + @ibis_udf.scalar.builtin(name="json_set") def json_set( # type: ignore[empty-body] json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String, json_value @@ -2059,6 +2093,46 @@ def json_value( # type: ignore[empty-body] """Retrieve value of a JSON field as plain STRING.""" +@ibis_udf.scalar.builtin(name="INT64") +def cast_json_to_int64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] + """Converts a JSON number to a SQL INT64 value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.INT64") +def cast_json_to_int64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] + """Converts a JSON number to a SQL INT64 value in the safe mode.""" + + +@ibis_udf.scalar.builtin(name="FLOAT64") +def cast_json_to_float64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL FLOAT64 value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.FLOAT64") +def cast_json_to_float64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL FLOAT64 value.""" + + +@ibis_udf.scalar.builtin(name="BOOL") +def cast_json_to_bool(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL BOOL value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.BOOL") +def cast_json_to_bool_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL BOOL value.""" + + +@ibis_udf.scalar.builtin(name="STRING") +def cast_json_to_string(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL STRING value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.STRING") +def cast_json_to_string_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL STRING value.""" + + @ibis_udf.scalar.builtin(name="ML.DISTANCE") def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64: # type: ignore[empty-body] """Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")""" diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 3852c417fa..1a5a9a96c1 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -13,12 +13,14 @@ # limitations under the License. import datetime as dt +import json import math import re import tempfile import db_dtypes # type: ignore import geopandas as gpd # type: ignore +import google.api_core.exceptions import numpy from packaging.version import Version import pandas as pd @@ -3474,9 +3476,11 @@ def foo(x): ("int64_col", pd.ArrowDtype(pa.timestamp("us"))), ("int64_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), ("int64_col", "time64[us][pyarrow]"), + ("int64_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), ("bool_col", "Int64"), ("bool_col", "string[pyarrow]"), ("bool_col", "Float64"), + ("bool_col", pd.ArrowDtype(db_dtypes.JSONArrowType())), ("string_col", "binary[pyarrow]"), ("bytes_col", "string[pyarrow]"), # pandas actually doesn't let folks convert to/from naive timestamp and @@ -3541,7 +3545,7 @@ def test_astype_safe(session): pd.testing.assert_series_equal(result, exepcted) -def test_series_astype_error_error(session): +def test_series_astype_w_invalid_error(session): input = pd.Series(["hello", "world", "3.11", "4000"]) with pytest.raises(ValueError): session.read_pandas(input).astype("Float64", errors="bad_value") @@ -3676,6 +3680,118 @@ def test_timestamp_astype_string(): assert bf_result.dtype == "string[pyarrow]" +@pytest.mark.parametrize("errors", ["raise", "null"]) +def test_float_astype_json(errors): + data = ["1.25", "2500000000", None, "-12323.24"] + bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) + + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) + assert bf_result.dtype == dtypes.JSON_DTYPE + + pd_result = bf_series.to_pandas().astype(dtypes.JSON_DTYPE) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +def test_string_astype_json(errors): + data = [ + "1", + None, + '["1","3","5"]', + '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', + ] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) + assert bf_result.dtype == dtypes.JSON_DTYPE + + pd_result = bf_series.to_pandas().astype(dtypes.JSON_DTYPE) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_string_astype_json_in_safe_mode(): + data = ["this is not a valid json string"] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors="null") + assert bf_result.dtype == dtypes.JSON_DTYPE + + expected = pd.Series([None], dtype=dtypes.JSON_DTYPE) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +def test_string_astype_json_raise_error(): + data = ["this is not a valid json string"] + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + with pytest.raises( + google.api_core.exceptions.BadRequest, + match="syntax error while parsing value", + ): + bf_series.astype(dtypes.JSON_DTYPE, errors="raise").to_pandas() + + +@pytest.mark.parametrize("errors", ["raise", "null"]) +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["1", "10.0", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["0.0001", "2500000000", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["true", "false", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(['"str"', None], dtypes.STRING_DTYPE, id="to_string"), + pytest.param( + ['"str"', None], + dtypes.TIME_DTYPE, + id="invalid", + marks=pytest.mark.xfail(raises=TypeError), + ), + ], +) +def test_json_astype_others(data, to_type, errors): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + + bf_result = bf_series.astype(to_type, errors=errors) + assert bf_result.dtype == to_type + + load_data = [json.loads(item) if item is not None else None for item in data] + expected = pd.Series(load_data, dtype=to_type) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), + ], +) +def test_json_astype_others_raise_error(data, to_type): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + with pytest.raises(google.api_core.exceptions.BadRequest): + bf_series.astype(to_type, errors="raise").to_pandas() + + +@pytest.mark.parametrize( + ("data", "to_type"), + [ + pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), + pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), + pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), + ], +) +def test_json_astype_others_in_safe_mode(data, to_type): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) + bf_result = bf_series.astype(to_type, errors="null") + assert bf_result.dtype == to_type + + expected = pd.Series([None, None], dtype=to_type) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(bf_result.to_pandas(), expected) + + @pytest.mark.parametrize( "index", [0, 5, -2], @@ -3687,9 +3803,7 @@ def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): assert bf_result == pd_result -def test_iloc_single_integer_out_of_bound_error( - scalars_df_index, scalars_pandas_df_index -): +def test_iloc_single_integer_out_of_bound_error(scalars_df_index): with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): scalars_df_index.string_col.iloc[99] diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py index 6e98d6a9e1..acccd7ea6c 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py @@ -1222,6 +1222,12 @@ def __sql_name__(self, op: ops.ScalarUDF | ops.AggUDF) -> str: # not actually a table, but easier to quote individual namespace # components this way namespace = op.__udf_namespace__ + + # Function names prefixed with "SAFE.", such as `SAFE.PARSE_JSON`, + # are typically not quoted. + if funcname.startswith("SAFE."): + return funcname + return sg.table(funcname, db=namespace.database, catalog=namespace.catalog).sql( self.dialect )