Skip to content

feat: support astype conversions to and from JSON dtypes #1716

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,6 +1148,35 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
elif to_type == ibis_dtypes.time:
return x_converted.time()

if to_type == ibis_dtypes.json:
if x.type() == ibis_dtypes.string:
return parse_json_in_safe(x) if op.safe else parse_json(x)
if x.type() == ibis_dtypes.bool:
x_bool = typing.cast(
ibis_types.StringValue,
bigframes.core.compile.ibis_types.cast_ibis_value(
x, ibis_dtypes.string, safe=op.safe
),
).lower()
return parse_json_in_safe(x_bool) if op.safe else parse_json(x_bool)
if x.type() in (ibis_dtypes.int64, ibis_dtypes.float64):
x_str = bigframes.core.compile.ibis_types.cast_ibis_value(
x, ibis_dtypes.string, safe=op.safe
)
return parse_json_in_safe(x_str) if op.safe else parse_json(x_str)

if x.type() == ibis_dtypes.json:
if to_type == ibis_dtypes.int64:
return cast_json_to_int64_in_safe(x) if op.safe else cast_json_to_int64(x)
if to_type == ibis_dtypes.float64:
return (
cast_json_to_float64_in_safe(x) if op.safe else cast_json_to_float64(x)
)
if to_type == ibis_dtypes.bool:
return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x)
if to_type == ibis_dtypes.string:
return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x)

# TODO: either inline this function, or push rest of this op into the function
return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe)

Expand Down Expand Up @@ -2031,6 +2060,11 @@ def parse_json(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body]
"""Converts a JSON-formatted STRING value to a JSON value."""


@ibis_udf.scalar.builtin(name="SAFE.PARSE_JSON")
def parse_json_in_safe(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body]
"""Converts a JSON-formatted STRING value to a JSON value in the safe mode."""


@ibis_udf.scalar.builtin(name="json_set")
def json_set( # type: ignore[empty-body]
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String, json_value
Expand Down Expand Up @@ -2059,6 +2093,46 @@ def json_value( # type: ignore[empty-body]
"""Retrieve value of a JSON field as plain STRING."""


@ibis_udf.scalar.builtin(name="INT64")
def cast_json_to_int64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body]
"""Converts a JSON number to a SQL INT64 value."""


@ibis_udf.scalar.builtin(name="SAFE.INT64")
def cast_json_to_int64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body]
"""Converts a JSON number to a SQL INT64 value in the safe mode."""


@ibis_udf.scalar.builtin(name="FLOAT64")
def cast_json_to_float64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body]
"""Attempts to convert a JSON value to a SQL FLOAT64 value."""


@ibis_udf.scalar.builtin(name="SAFE.FLOAT64")
def cast_json_to_float64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body]
"""Attempts to convert a JSON value to a SQL FLOAT64 value."""


@ibis_udf.scalar.builtin(name="BOOL")
def cast_json_to_bool(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body]
"""Attempts to convert a JSON value to a SQL BOOL value."""


@ibis_udf.scalar.builtin(name="SAFE.BOOL")
def cast_json_to_bool_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body]
"""Attempts to convert a JSON value to a SQL BOOL value."""


@ibis_udf.scalar.builtin(name="STRING")
def cast_json_to_string(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: ignore[empty-body]
"""Attempts to convert a JSON value to a SQL STRING value."""


@ibis_udf.scalar.builtin(name="SAFE.STRING")
def cast_json_to_string_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: ignore[empty-body]
"""Attempts to convert a JSON value to a SQL STRING value."""


@ibis_udf.scalar.builtin(name="ML.DISTANCE")
def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64: # type: ignore[empty-body]
"""Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")"""
Expand Down
122 changes: 118 additions & 4 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@
# limitations under the License.

import datetime as dt
import json
import math
import re
import tempfile

import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import google.api_core.exceptions
import numpy
from packaging.version import Version
import pandas as pd
Expand Down Expand Up @@ -3474,9 +3476,11 @@ def foo(x):
("int64_col", pd.ArrowDtype(pa.timestamp("us"))),
("int64_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))),
("int64_col", "time64[us][pyarrow]"),
("int64_col", pd.ArrowDtype(db_dtypes.JSONArrowType())),
("bool_col", "Int64"),
("bool_col", "string[pyarrow]"),
("bool_col", "Float64"),
("bool_col", pd.ArrowDtype(db_dtypes.JSONArrowType())),
("string_col", "binary[pyarrow]"),
("bytes_col", "string[pyarrow]"),
# pandas actually doesn't let folks convert to/from naive timestamp and
Expand Down Expand Up @@ -3541,7 +3545,7 @@ def test_astype_safe(session):
pd.testing.assert_series_equal(result, exepcted)


def test_series_astype_error_error(session):
def test_series_astype_w_invalid_error(session):
input = pd.Series(["hello", "world", "3.11", "4000"])
with pytest.raises(ValueError):
session.read_pandas(input).astype("Float64", errors="bad_value")
Expand Down Expand Up @@ -3676,6 +3680,118 @@ def test_timestamp_astype_string():
assert bf_result.dtype == "string[pyarrow]"


@pytest.mark.parametrize("errors", ["raise", "null"])
def test_float_astype_json(errors):
data = ["1.25", "2500000000", None, "-12323.24"]
bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE)

bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors)
assert bf_result.dtype == dtypes.JSON_DTYPE

pd_result = bf_series.to_pandas().astype(dtypes.JSON_DTYPE)
pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)


@pytest.mark.parametrize("errors", ["raise", "null"])
def test_string_astype_json(errors):
data = [
"1",
None,
'["1","3","5"]',
'{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}',
]
bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE)

bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors)
assert bf_result.dtype == dtypes.JSON_DTYPE

pd_result = bf_series.to_pandas().astype(dtypes.JSON_DTYPE)
pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)


def test_string_astype_json_in_safe_mode():
data = ["this is not a valid json string"]
bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE)
bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors="null")
assert bf_result.dtype == dtypes.JSON_DTYPE

expected = pd.Series([None], dtype=dtypes.JSON_DTYPE)
expected.index = expected.index.astype("Int64")
pd.testing.assert_series_equal(bf_result.to_pandas(), expected)


def test_string_astype_json_raise_error():
data = ["this is not a valid json string"]
bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE)
with pytest.raises(
google.api_core.exceptions.BadRequest,
match="syntax error while parsing value",
):
bf_series.astype(dtypes.JSON_DTYPE, errors="raise").to_pandas()


@pytest.mark.parametrize("errors", ["raise", "null"])
@pytest.mark.parametrize(
("data", "to_type"),
[
pytest.param(["1", "10.0", None], dtypes.INT_DTYPE, id="to_int"),
pytest.param(["0.0001", "2500000000", None], dtypes.FLOAT_DTYPE, id="to_float"),
pytest.param(["true", "false", None], dtypes.BOOL_DTYPE, id="to_bool"),
pytest.param(['"str"', None], dtypes.STRING_DTYPE, id="to_string"),
pytest.param(
['"str"', None],
dtypes.TIME_DTYPE,
id="invalid",
marks=pytest.mark.xfail(raises=TypeError),
),
],
)
def test_json_astype_others(data, to_type, errors):
bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE)

bf_result = bf_series.astype(to_type, errors=errors)
assert bf_result.dtype == to_type

load_data = [json.loads(item) if item is not None else None for item in data]
expected = pd.Series(load_data, dtype=to_type)
expected.index = expected.index.astype("Int64")
pd.testing.assert_series_equal(bf_result.to_pandas(), expected)


@pytest.mark.parametrize(
("data", "to_type"),
[
pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"),
pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"),
pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"),
pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"),
],
)
def test_json_astype_others_raise_error(data, to_type):
bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE)
with pytest.raises(google.api_core.exceptions.BadRequest):
bf_series.astype(to_type, errors="raise").to_pandas()


@pytest.mark.parametrize(
("data", "to_type"),
[
pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"),
pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"),
pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"),
pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"),
],
)
def test_json_astype_others_in_safe_mode(data, to_type):
bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE)
bf_result = bf_series.astype(to_type, errors="null")
assert bf_result.dtype == to_type

expected = pd.Series([None, None], dtype=to_type)
expected.index = expected.index.astype("Int64")
pd.testing.assert_series_equal(bf_result.to_pandas(), expected)


@pytest.mark.parametrize(
"index",
[0, 5, -2],
Expand All @@ -3687,9 +3803,7 @@ def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index):
assert bf_result == pd_result


def test_iloc_single_integer_out_of_bound_error(
scalars_df_index, scalars_pandas_df_index
):
def test_iloc_single_integer_out_of_bound_error(scalars_df_index):
with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"):
scalars_df_index.string_col.iloc[99]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1222,6 +1222,12 @@ def __sql_name__(self, op: ops.ScalarUDF | ops.AggUDF) -> str:
# not actually a table, but easier to quote individual namespace
# components this way
namespace = op.__udf_namespace__

# Function names prefixed with "SAFE.", such as `SAFE.PARSE_JSON`,
# are typically not quoted.
if funcname.startswith("SAFE."):
return funcname

return sg.table(funcname, db=namespace.database, catalog=namespace.catalog).sql(
self.dialect
)
Expand Down