Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c2b03a8
[data]Compute Expressions-fixed-size-array-operations
400Ping Nov 18, 2025
b1615a8
fix property
400Ping Nov 18, 2025
950ef28
add type-checking
400Ping Nov 18, 2025
7819f4b
[data] Improve array namespace docs and tests
400Ping Nov 21, 2025
0bc9400
fix linter
400Ping Nov 21, 2025
7a51d0b
fix
400Ping Nov 23, 2025
6cd7558
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 26, 2025
d666c74
fix
400Ping Nov 26, 2025
afb4571
fix linter
400Ping Nov 26, 2025
2957754
fix test
400Ping Nov 26, 2025
040877a
fix test_namespace_expressions
400Ping Nov 27, 2025
0fd0ff4
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 27, 2025
24f7eaf
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 29, 2025
c9b92a6
fix
400Ping Dec 1, 2025
804c228
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 1, 2025
a7e4520
Update python/ray/data/namespace_expressions/arr_namespace.py
400Ping Dec 1, 2025
70dbbd0
fix format
400Ping Dec 1, 2025
7a6a2fb
fix
400Ping Dec 5, 2025
244e2b0
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 10, 2025
2829610
fix
400Ping Dec 12, 2025
4717267
update
400Ping Dec 17, 2025
bb6fb05
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 17, 2025
c72eb5b
remove flatten()
400Ping Dec 31, 2025
71e4784
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 31, 2025
c46d1fa
Merge remote-tracking branch 'upstream/master' into data/compute-expr…
400Ping Jan 9, 2026
c3d3bb2
fix CI error
400Ping Jan 9, 2026
aa9ded9
update
400Ping Jan 14, 2026
2a283d8
fix
400Ping Jan 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions python/ray/data/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from ray.util.annotations import DeveloperAPI, PublicAPI

if TYPE_CHECKING:
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace
from ray.data.namespace_expressions.list_namespace import _ListNamespace
from ray.data.namespace_expressions.string_namespace import _StringNamespace
from ray.data.namespace_expressions.struct_namespace import _StructNamespace
Expand Down Expand Up @@ -486,6 +487,20 @@ def struct(self) -> "_StructNamespace":

return _StructNamespace(self)

@property
def arr(self) -> "_ArrayNamespace":
    """Return the fixed-size array namespace bound to this expression.

    Example
    -------
    >>> from ray.data.expressions import col
    >>> # Assume ``features`` is a FixedSizeList column
    >>> expr = col("features").arr.flatten()
    """
    # Imported lazily: the namespace module itself imports from
    # ``ray.data.expressions``, so a top-level import would be circular.
    from ray.data.namespace_expressions import arr_namespace

    namespace = arr_namespace._ArrayNamespace(self)
    return namespace

def _unalias(self) -> "Expr":
    """Return this expression itself.

    NOTE(review): judging by the name, alias-carrying subclasses presumably
    override this to return the expression they wrap -- confirm.
    """
    return self

Expand Down Expand Up @@ -1061,6 +1076,7 @@ def download(uri_column_name: str) -> DownloadExpr:
"_ListNamespace",
"_StringNamespace",
"_StructNamespace",
"_ArrayNamespace",
]


Expand All @@ -1078,4 +1094,8 @@ def __getattr__(name: str):
from ray.data.namespace_expressions.struct_namespace import _StructNamespace

return _StructNamespace
elif name == "_ArrayNamespace":
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace

return _ArrayNamespace
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
99 changes: 99 additions & 0 deletions python/ray/data/namespace_expressions/arr_namespace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""
Fixed-size array namespace for expression operations.

This namespace handles Arrow ``FixedSizeListArray`` columns. It provides
helper methods for flattening nested arrays and converting them into
Python lists.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

import pyarrow
import pyarrow.compute as pc

from ray.data.datatype import DataType
from ray.data.expressions import pyarrow_udf

if TYPE_CHECKING:
from ray.data.expressions import Expr, UDFExpr


def _fixed_size_list_to_list_array(arr: pyarrow.Array) -> pyarrow.Array:
    """Convert a FixedSizeListArray into a variable-length ``ListArray``.

    The result has one list per input row and uses the same value type as
    the input. Null rows stay null in the output.

    Args:
        arr: An Arrow array whose type exposes ``value_type`` (i.e. a
            fixed-size list array).

    Returns:
        A ``ListArray`` of type ``list_(arr.type.value_type)`` with the same
        length as ``arr``.
    """
    # Iterating an Arrow array yields ``pyarrow.Scalar`` objects, never
    # ``None``, so a ``sub is None`` check cannot detect null rows.
    # ``to_pylist()`` converts every row to a plain Python list and maps null
    # rows (and null child values) to ``None``.
    py_lists = arr.to_pylist()

    # Pin the output type explicitly so that empty or all-null inputs still
    # produce a list column with the input's value type. ``None`` entries in
    # the sequence become nulls in the resulting ListArray.
    list_type = pyarrow.list_(arr.type.value_type)
    return pyarrow.array(py_lists, type=list_type)


@dataclass
class _ArrayNamespace:
    """Namespace for fixed-size array operations on expression columns.

    Example
    -------
    >>> from ray.data.expressions import col
    >>> # "features" is a FixedSizeList column, e.g. 3-d embeddings
    >>> expr = col("features").arr.to_list()
    >>> # You can then use this expression inside Dataset.select/with_columns.
    """

    # The expression whose fixed-size list values the methods operate on.
    _expr: "Expr"

    def flatten(self) -> "UDFExpr":
        """Flatten each fixed-size array into a variable-length list.

        The output always has exactly one list per input row:

        * If the array's values are themselves lists (``list``,
          ``large_list`` or ``fixed_size_list``), one level of nesting is
          removed within each row, e.g. ``[[1, 2], [3, 4]] -> [1, 2, 3, 4]``.
          Null inner lists contribute no elements.
        * Otherwise the row is returned as a plain list, which is the same
          result as :meth:`to_list`.

        Null rows stay null.

        Returns:
            A UDF expression producing an Arrow ``ListArray`` column.
        """
        return_dtype = DataType(object)

        @pyarrow_udf(return_dtype=return_dtype)
        def _flatten(arr: pyarrow.Array) -> pyarrow.Array:
            # NOTE: ``pyarrow.compute.list_flatten`` is deliberately not used
            # here. It removes the top list level across the *whole* array and
            # returns the concatenated child values, so the output would no
            # longer have one entry per input row.
            value_type = arr.type.value_type
            rows = arr.to_pylist()

            is_nested = (
                pyarrow.types.is_list(value_type)
                or pyarrow.types.is_large_list(value_type)
                or pyarrow.types.is_fixed_size_list(value_type)
            )
            if is_nested:
                # Concatenate the inner lists of each row, keeping null rows.
                rows = [
                    None
                    if row is None
                    else [
                        item
                        for inner in row
                        if inner is not None
                        for item in inner
                    ]
                    for row in rows
                ]
                value_type = value_type.value_type

            return pyarrow.array(rows, type=pyarrow.list_(value_type))

        return _flatten(self._expr)

    def to_list(self) -> "UDFExpr":
        """Convert each fixed-size array into a Python list.

        The output is a variable-length list column (Arrow ListArray) that can
        be consumed as Python lists when materialized from a Dataset.

        Returns:
            A UDF expression producing an Arrow ``ListArray`` column.
        """
        return_dtype = DataType(object)

        @pyarrow_udf(return_dtype=return_dtype)
        def _to_list(arr: pyarrow.Array) -> pyarrow.Array:
            return _fixed_size_list_to_list_array(arr)

        return _to_list(self._expr)
40 changes: 40 additions & 0 deletions python/ray/data/tests/test_namespace_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,46 @@ def test_struct_nested_bracket(self, dataset_format):
assert_df_equal(result, expected)


# ──────────────────────────────────────
# Array Namespace Tests
# ──────────────────────────────────────


def _make_fixed_size_list_table() -> pa.Table:
    """Return a one-column table ``features`` of fixed-size int64 pairs.

    The column holds three rows of length 2: ``[[1, 2], [3, 4], [5, 6]]``.
    """
    flat_values = pa.array([1, 2, 3, 4, 5, 6], type=pa.int64())
    # Every two consecutive flat values form one fixed-size row.
    pairs = pa.FixedSizeListArray.from_arrays(flat_values, list_size=2)
    return pa.Table.from_arrays([pairs], names=["features"])


def test_arr_to_list(ray_start_regular):
    """``arr.to_list`` yields one Python list per fixed-size row."""
    ds = ray.data.from_arrow(_make_fixed_size_list_table())
    to_list_expr = col("features").arr.to_list().alias("features")

    rows = ds.select(to_list_expr).take(3)

    expected = [{"features": pair} for pair in ([1, 2], [3, 4], [5, 6])]
    assert rows == expected


def test_arr_flatten(ray_start_regular):
    """``arr.flatten`` on a non-nested fixed-size list matches ``to_list``."""
    ds = ray.data.from_arrow(_make_fixed_size_list_table())
    flatten_expr = col("features").arr.flatten().alias("features")

    rows = ds.select(flatten_expr).take(3)

    # For a simple FixedSizeListArray, flatten should behave like to_list
    expected = [{"features": pair} for pair in ([1, 2], [3, 4], [5, 6])]
    assert rows == expected


# ──────────────────────────────────────
# Integration Tests
# ──────────────────────────────────────
Expand Down