Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c2b03a8
[data]Compute Expressions-fixed-size-array-operations
400Ping Nov 18, 2025
b1615a8
fix prpperty
400Ping Nov 18, 2025
950ef28
add type-checking
400Ping Nov 18, 2025
7819f4b
[data] Improve array namespace docs and tests
400Ping Nov 21, 2025
0bc9400
fix linter
400Ping Nov 21, 2025
7a51d0b
fix
400Ping Nov 23, 2025
6cd7558
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 26, 2025
d666c74
fix
400Ping Nov 26, 2025
afb4571
fix linter
400Ping Nov 26, 2025
2957754
fix test
400Ping Nov 26, 2025
040877a
fix test_namespace_expressions
400Ping Nov 27, 2025
0fd0ff4
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 27, 2025
24f7eaf
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 29, 2025
c9b92a6
fix
400Ping Dec 1, 2025
804c228
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 1, 2025
a7e4520
Update python/ray/data/namespace_expressions/arr_namespace.py
400Ping Dec 1, 2025
70dbbd0
fix format
400Ping Dec 1, 2025
7a6a2fb
fix
400Ping Dec 5, 2025
244e2b0
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 10, 2025
2829610
fix
400Ping Dec 12, 2025
4717267
update
400Ping Dec 17, 2025
bb6fb05
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 17, 2025
c72eb5b
remove flatten()
400Ping Dec 31, 2025
71e4784
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 31, 2025
c46d1fa
Merge remote-tracking branch 'upstream/master' into data/compute-expr…
400Ping Jan 9, 2026
c3d3bb2
fix CI error
400Ping Jan 9, 2026
aa9ded9
update
400Ping Jan 14, 2026
2a283d8
fix
400Ping Jan 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions python/ray/data/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ray.util.annotations import DeveloperAPI, PublicAPI

if TYPE_CHECKING:
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace
from ray.data.namespace_expressions.dt_namespace import _DatetimeNamespace
from ray.data.namespace_expressions.list_namespace import _ListNamespace
from ray.data.namespace_expressions.string_namespace import _StringNamespace
Expand Down Expand Up @@ -577,12 +578,20 @@ def abs(self) -> "UDFExpr":
"""
return _create_pyarrow_compute_udf(pc.abs_checked)(self)

@property
def arr(self) -> "_ArrayNamespace":
"""Access array operations for this expression."""
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace

return _ArrayNamespace(self)

@property
def list(self) -> "_ListNamespace":
"""Access list operations for this expression.

Returns:
A _ListNamespace that provides list-specific operations.
A _ListNamespace that provides list-specific operations for both
PyArrow ``List`` and ``FixedSizeList`` columns.

Example:
>>> from ray.data.expressions import col
Expand Down Expand Up @@ -1490,6 +1499,7 @@ def download(uri_column_name: str) -> DownloadExpr:
"lit",
"download",
"star",
"_ArrayNamespace",
"_ListNamespace",
"_StringNamespace",
"_StructNamespace",
Expand All @@ -1499,7 +1509,11 @@ def download(uri_column_name: str) -> DownloadExpr:

def __getattr__(name: str):
"""Lazy import of namespace classes to avoid circular imports."""
if name == "_ListNamespace":
if name == "_ArrayNamespace":
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace

return _ArrayNamespace
elif name == "_ListNamespace":
from ray.data.namespace_expressions.list_namespace import _ListNamespace

return _ListNamespace
Expand Down
54 changes: 54 additions & 0 deletions python/ray/data/namespace_expressions/arr_namespace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Array namespace for expression operations on array-typed columns."""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

import pyarrow

from ray.data.datatype import DataType
from ray.data.expressions import pyarrow_udf

if TYPE_CHECKING:
from ray.data.expressions import Expr, UDFExpr


@dataclass
class _ArrayNamespace:
"""Namespace for array operations on expression columns.

Example:
>>> from ray.data.expressions import col
>>> # Convert fixed-size lists to variable-length lists
>>> expr = col("features").arr.to_list()
"""

_expr: Expr

def to_list(self) -> "UDFExpr":
"""Convert FixedSizeList columns into variable-length lists."""
return_dtype = DataType(object)

expr_dtype = self._expr.data_type
if expr_dtype.is_list_type():
arrow_type = expr_dtype.to_arrow_dtype()
if pyarrow.types.is_fixed_size_list(arrow_type):
return_dtype = DataType.from_arrow(pyarrow.list_(arrow_type.value_type))
else:
return_dtype = expr_dtype

@pyarrow_udf(return_dtype=return_dtype)
def _to_list(arr: pyarrow.Array) -> pyarrow.Array:
arr_dtype = DataType.from_arrow(arr.type)
if not arr_dtype.is_list_type():
raise pyarrow.lib.ArrowInvalid(
"to_list() can only be called on list-like columns, "
f"but got {arr.type}"
)

if isinstance(arr.type, pyarrow.FixedSizeListType):
return arr.cast(pyarrow.list_(arr.type.value_type))
return arr

return _to_list(self._expr)
63 changes: 63 additions & 0 deletions python/ray/data/tests/expressions/test_namespace_arr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Integration tests for array namespace expressions.

These tests require Ray and test end-to-end array namespace expression evaluation.
"""

import pandas as pd
import pyarrow as pa
import pytest
from packaging import version

import ray
from ray.data._internal.util import rows_same
from ray.data.expressions import col
from ray.data.tests.conftest import * # noqa
from ray.tests.conftest import * # noqa

pytestmark = pytest.mark.skipif(
version.parse(pa.__version__) < version.parse("19.0.0"),
reason="Namespace expressions tests require PyArrow >= 19.0",
)


def _make_fixed_size_list_table() -> pa.Table:
values = pa.array([1, 2, 3, 4, 5, 6], type=pa.int64())
fixed = pa.FixedSizeListArray.from_arrays(values, list_size=2)
return pa.Table.from_arrays([fixed], names=["features"])


def test_arr_to_list_fixed_size(ray_start_regular_shared):
table = _make_fixed_size_list_table()
ds = ray.data.from_arrow(table)

result = (
ds.with_column("features", col("features").arr.to_list())
.select_columns(["features"])
.to_pandas()
)
expected = pd.DataFrame(
[
{"features": [1, 2]},
{"features": [3, 4]},
{"features": [5, 6]},
]
)

assert rows_same(result, expected)


def test_arr_to_list_invalid_dtype_raises(ray_start_regular_shared):
ds = ray.data.from_items([{"value": 1}, {"value": 2}])

with pytest.raises(
(ray.exceptions.RayTaskError, ray.exceptions.UserCodeException)
) as exc_info:
ds.with_column("value_list", col("value").arr.to_list()).to_pandas()

assert "to_list() can only be called on list-like columns" in str(exc_info.value)


if __name__ == "__main__":
import sys

sys.exit(pytest.main(["-v", __file__]))