Skip to content

Commit 8cf1200

Browse files
400Pinggemini-code-assist[bot]
authored andcommitted
[Data] Compute Expressions-fixed size array (ray-project#58741)
## Description Completing the fixed-size array namespace operations ## Related issues Related to ray-project#58674 ## Additional information --------- Signed-off-by: 400Ping <fourhundredping@gmail.com> Signed-off-by: Ping <fourhundredping@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: peterxcli <peterxcli@gmail.com>
1 parent f91afc6 commit 8cf1200

File tree

3 files changed

+133
-2
lines changed

3 files changed

+133
-2
lines changed

python/ray/data/expressions.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from ray.util.annotations import DeveloperAPI, PublicAPI
2727

2828
if TYPE_CHECKING:
29+
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace
2930
from ray.data.namespace_expressions.dt_namespace import _DatetimeNamespace
3031
from ray.data.namespace_expressions.list_namespace import _ListNamespace
3132
from ray.data.namespace_expressions.string_namespace import _StringNamespace
@@ -577,12 +578,20 @@ def abs(self) -> "UDFExpr":
577578
"""
578579
return _create_pyarrow_compute_udf(pc.abs_checked)(self)
579580

581+
@property
582+
def arr(self) -> "_ArrayNamespace":
583+
"""Access array operations for this expression."""
584+
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace
585+
586+
return _ArrayNamespace(self)
587+
580588
@property
581589
def list(self) -> "_ListNamespace":
582590
"""Access list operations for this expression.
583591
584592
Returns:
585-
A _ListNamespace that provides list-specific operations.
593+
A _ListNamespace that provides list-specific operations for both
594+
PyArrow ``List`` and ``FixedSizeList`` columns.
586595
587596
Example:
588597
>>> from ray.data.expressions import col
@@ -1490,6 +1499,7 @@ def download(uri_column_name: str) -> DownloadExpr:
14901499
"lit",
14911500
"download",
14921501
"star",
1502+
"_ArrayNamespace",
14931503
"_ListNamespace",
14941504
"_StringNamespace",
14951505
"_StructNamespace",
@@ -1499,7 +1509,11 @@ def download(uri_column_name: str) -> DownloadExpr:
14991509

15001510
def __getattr__(name: str):
15011511
"""Lazy import of namespace classes to avoid circular imports."""
1502-
if name == "_ListNamespace":
1512+
if name == "_ArrayNamespace":
1513+
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace
1514+
1515+
return _ArrayNamespace
1516+
elif name == "_ListNamespace":
15031517
from ray.data.namespace_expressions.list_namespace import _ListNamespace
15041518

15051519
return _ListNamespace
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""Array namespace for expression operations on array-typed columns."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass
6+
from typing import TYPE_CHECKING
7+
8+
import pyarrow
9+
10+
from ray.data.datatype import DataType
11+
from ray.data.expressions import pyarrow_udf
12+
13+
if TYPE_CHECKING:
14+
from ray.data.expressions import Expr, UDFExpr
15+
16+
17+
@dataclass
18+
class _ArrayNamespace:
19+
"""Namespace for array operations on expression columns.
20+
21+
Example:
22+
>>> from ray.data.expressions import col
23+
>>> # Convert fixed-size lists to variable-length lists
24+
>>> expr = col("features").arr.to_list()
25+
"""
26+
27+
_expr: Expr
28+
29+
def to_list(self) -> "UDFExpr":
30+
"""Convert FixedSizeList columns into variable-length lists."""
31+
return_dtype = DataType(object)
32+
33+
expr_dtype = self._expr.data_type
34+
if expr_dtype.is_list_type():
35+
arrow_type = expr_dtype.to_arrow_dtype()
36+
if pyarrow.types.is_fixed_size_list(arrow_type):
37+
return_dtype = DataType.from_arrow(pyarrow.list_(arrow_type.value_type))
38+
else:
39+
return_dtype = expr_dtype
40+
41+
@pyarrow_udf(return_dtype=return_dtype)
42+
def _to_list(arr: pyarrow.Array) -> pyarrow.Array:
43+
arr_dtype = DataType.from_arrow(arr.type)
44+
if not arr_dtype.is_list_type():
45+
raise pyarrow.lib.ArrowInvalid(
46+
"to_list() can only be called on list-like columns, "
47+
f"but got {arr.type}"
48+
)
49+
50+
if isinstance(arr.type, pyarrow.FixedSizeListType):
51+
return arr.cast(pyarrow.list_(arr.type.value_type))
52+
return arr
53+
54+
return _to_list(self._expr)
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""Integration tests for array namespace expressions.
2+
3+
These tests require Ray and test end-to-end array namespace expression evaluation.
4+
"""
5+
6+
import pandas as pd
7+
import pyarrow as pa
8+
import pytest
9+
from packaging import version
10+
11+
import ray
12+
from ray.data._internal.util import rows_same
13+
from ray.data.expressions import col
14+
from ray.data.tests.conftest import * # noqa
15+
from ray.tests.conftest import * # noqa
16+
17+
pytestmark = pytest.mark.skipif(
18+
version.parse(pa.__version__) < version.parse("19.0.0"),
19+
reason="Namespace expressions tests require PyArrow >= 19.0",
20+
)
21+
22+
23+
def _make_fixed_size_list_table() -> pa.Table:
24+
values = pa.array([1, 2, 3, 4, 5, 6], type=pa.int64())
25+
fixed = pa.FixedSizeListArray.from_arrays(values, list_size=2)
26+
return pa.Table.from_arrays([fixed], names=["features"])
27+
28+
29+
def test_arr_to_list_fixed_size(ray_start_regular_shared):
30+
table = _make_fixed_size_list_table()
31+
ds = ray.data.from_arrow(table)
32+
33+
result = (
34+
ds.with_column("features", col("features").arr.to_list())
35+
.select_columns(["features"])
36+
.to_pandas()
37+
)
38+
expected = pd.DataFrame(
39+
[
40+
{"features": [1, 2]},
41+
{"features": [3, 4]},
42+
{"features": [5, 6]},
43+
]
44+
)
45+
46+
assert rows_same(result, expected)
47+
48+
49+
def test_arr_to_list_invalid_dtype_raises(ray_start_regular_shared):
50+
ds = ray.data.from_items([{"value": 1}, {"value": 2}])
51+
52+
with pytest.raises(
53+
(ray.exceptions.RayTaskError, ray.exceptions.UserCodeException)
54+
) as exc_info:
55+
ds.with_column("value_list", col("value").arr.to_list()).to_pandas()
56+
57+
assert "to_list() can only be called on list-like columns" in str(exc_info.value)
58+
59+
60+
if __name__ == "__main__":
61+
import sys
62+
63+
sys.exit(pytest.main(["-v", __file__]))

0 commit comments

Comments
 (0)