Skip to content
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c2b03a8
[data]Compute Expressions-fixed-size-array-operations
400Ping Nov 18, 2025
b1615a8
fix prpperty
400Ping Nov 18, 2025
950ef28
add type-checking
400Ping Nov 18, 2025
7819f4b
[data] Improve array namespace docs and tests
400Ping Nov 21, 2025
0bc9400
fix linter
400Ping Nov 21, 2025
7a51d0b
fix
400Ping Nov 23, 2025
6cd7558
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 26, 2025
d666c74
fix
400Ping Nov 26, 2025
afb4571
fix linter
400Ping Nov 26, 2025
2957754
fix test
400Ping Nov 26, 2025
040877a
fix test_namespace_expressions
400Ping Nov 27, 2025
0fd0ff4
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 27, 2025
24f7eaf
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 29, 2025
c9b92a6
fix
400Ping Dec 1, 2025
804c228
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 1, 2025
a7e4520
Update python/ray/data/namespace_expressions/arr_namespace.py
400Ping Dec 1, 2025
70dbbd0
fix format
400Ping Dec 1, 2025
7a6a2fb
fix
400Ping Dec 5, 2025
244e2b0
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 10, 2025
2829610
fix
400Ping Dec 12, 2025
4717267
update
400Ping Dec 17, 2025
bb6fb05
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 17, 2025
c72eb5b
remove flatten()
400Ping Dec 31, 2025
71e4784
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 31, 2025
c46d1fa
Merge remote-tracking branch 'upstream/master' into data/compute-expr…
400Ping Jan 9, 2026
c3d3bb2
fix CI error
400Ping Jan 9, 2026
aa9ded9
update
400Ping Jan 14, 2026
2a283d8
fix
400Ping Jan 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions python/ray/data/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from ray.util.annotations import DeveloperAPI, PublicAPI

if TYPE_CHECKING:
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace
from ray.data.namespace_expressions.dt_namespace import _DatetimeNamespace
from ray.data.namespace_expressions.list_namespace import _ListNamespace
from ray.data.namespace_expressions.string_namespace import _StringNamespace
Expand Down Expand Up @@ -487,6 +488,20 @@ def struct(self) -> "_StructNamespace":

return _StructNamespace(self)

@property
def arr(self) -> "_ArrayNamespace":
"""Access fixed-size array operations for this expression.

Example
-------
>>> from ray.data.expressions import col
>>> # Assume ``features`` is a FixedSizeList column
>>> expr = col("features").arr.flatten()
"""
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace

return _ArrayNamespace(self)

@property
def dt(self) -> "_DatetimeNamespace":
"""Access datetime operations for this expression."""
Expand Down Expand Up @@ -1069,6 +1084,7 @@ def download(uri_column_name: str) -> DownloadExpr:
"_ListNamespace",
"_StringNamespace",
"_StructNamespace",
"_ArrayNamespace",
"_DatetimeNamespace",
]

Expand All @@ -1087,6 +1103,10 @@ def __getattr__(name: str):
from ray.data.namespace_expressions.struct_namespace import _StructNamespace

return _StructNamespace
elif name == "_ArrayNamespace":
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace

return _ArrayNamespace
elif name == "_DatetimeNamespace":
from ray.data.namespace_expressions.dt_namespace import _DatetimeNamespace

Expand Down
107 changes: 107 additions & 0 deletions python/ray/data/namespace_expressions/arr_namespace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""
Fixed-size array namespace for expression operations.

This namespace handles Arrow ``FixedSizeListArray`` columns. It provides
helper methods for flattening nested arrays and converting them into
Python lists.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

import pyarrow
import pyarrow.compute as pc

from ray.data.datatype import DataType
from ray.data.expressions import pyarrow_udf

if TYPE_CHECKING:
from ray.data.expressions import Expr, UDFExpr


@dataclass
class _ArrayNamespace:
"""Namespace for fixed-size array operations on expression columns.

Example
-------
>>> from ray.data.expressions import col
>>> # "features" is a FixedSizeList column, e.g. 3-d embeddings
>>> expr = col("features").arr.to_list()
>>> # You can then use this expression inside Dataset.select/with_columns.
"""

_expr: "Expr"

def flatten(self) -> "UDFExpr":
"""Flattens a column of fixed-size lists of lists into a single list.

This is a thin wrapper around :func:`pyarrow.compute.list_flatten`.

For example, a column with type
``fixed_size_list(list<item: int64>, 2)`` will be converted to a
``list<item: int64>`` column where each row's nested lists are
concatenated.

This operation is only valid for nested list types. If the input
is not a list-of-lists, PyArrow will raise a type error.
"""
# Default to object if we cannot infer a better type.
return_dtype = DataType(object)

expr_dtype = self._expr.data_type
if expr_dtype.is_list_type():
outer_arrow_type = expr_dtype.to_arrow_dtype()
inner_arrow_type = outer_arrow_type.value_type

# Inner list type; list_flatten(list<list<T>>) -> list<T>
inner_dtype = DataType.from_arrow(inner_arrow_type)
if inner_dtype.is_list_type():
value_arrow_type = inner_arrow_type.value_type
return_dtype = DataType.from_arrow(pyarrow.list_(value_arrow_type))

@pyarrow_udf(return_dtype=return_dtype)
def _flatten(arr: pyarrow.Array) -> pyarrow.Array:
return pc.list_flatten(arr)

return _flatten(self._expr)

def to_list(self) -> "UDFExpr":
"""Convert each fixed-size array into a variable-length list.

This converts a ``FixedSizeListArray`` into an equivalent
``ListArray`` with the same value type, using PyArrow's cast
operator. Non-fixed-size list arrays are returned unchanged.
"""
# Default to object; refine when we know more.
return_dtype = DataType(object)

expr_dtype = self._expr.data_type
if expr_dtype.is_list_type():
arrow_type = expr_dtype.to_arrow_dtype()
if pyarrow.types.is_fixed_size_list(arrow_type):
# FixedSizeListArray<T> -> ListArray<T>
return_dtype = DataType.from_arrow(pyarrow.list_(arrow_type.value_type))
else:
# Other list-like arrays are returned unchanged.
return_dtype = expr_dtype

@pyarrow_udf(return_dtype=return_dtype)
def _to_list(arr: pyarrow.Array) -> pyarrow.Array:
# Validate that this is a list-like Arrow type using DataType.
arr_dtype = DataType.from_arrow(arr.type)
if not arr_dtype.is_list_type():
raise pyarrow.lib.ArrowInvalid(
"arr.to_list() can only be called on list-like columns, "
f"but got {arr.type}"
)

# Only FixedSizeListArray needs conversion; other list-like
# types are returned unchanged.
if isinstance(arr.type, pyarrow.FixedSizeListType):
return arr.cast(pyarrow.list_(arr.type.value_type))
return arr

return _to_list(self._expr)
69 changes: 69 additions & 0 deletions python/ray/data/tests/test_namespace_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,75 @@ def test_dt_namespace_invalid_dtype_raises(ray_start_regular):
ds.select(col("value").dt.year()).to_pandas()


# ──────────────────────────────────────
# Array Namespace Tests
# ──────────────────────────────────────


def _make_fixed_size_list_table() -> pa.Table:
# Build a FixedSizeListArray with 3 rows, each of length 2:
# [[1, 2], [3, 4], [5, 6]]
values = pa.array([1, 2, 3, 4, 5, 6], type=pa.int64())
fixed = pa.FixedSizeListArray.from_arrays(values, list_size=2)
return pa.Table.from_arrays([fixed], names=["features"])


def _make_nested_fixed_size_list_table() -> pa.Table:
# Build a nested FixedSizeListArray with 3 rows:
#
# row0 -> [[1, 2], [3, 4]]
# row1 -> [[5, 6], [7, 8]]
# row2 -> [[9, 10], [11, 12]]
#
# Type is: fixed_size_list(list<int64>, 2)
inner_type = pa.list_(pa.int64())
inner_values = pa.array(
[
[1, 2],
[3, 4],
[5, 6],
[7, 8],
[9, 10],
[11, 12],
],
type=inner_type,
)
fixed_nested = pa.FixedSizeListArray.from_arrays(inner_values, list_size=2)
return pa.Table.from_arrays([fixed_nested], names=["features"])


def test_arr_to_list(ray_start_regular):
table = _make_fixed_size_list_table()
ds = ray.data.from_arrow(table)

result = ds.select(col("features").arr.to_list().alias("features")).to_pandas()
expected = pd.DataFrame(
[
{"features": [1, 2]},
{"features": [3, 4]},
{"features": [5, 6]},
]
)

assert rows_same(result, expected)


def test_arr_flatten(ray_start_regular):
table = _make_nested_fixed_size_list_table()
ds = ray.data.from_arrow(table)

result = ds.select(col("features").arr.flatten().alias("features")).to_pandas()
expected = pd.DataFrame(
[
{"features": [1, 2, 3, 4]},
{"features": [5, 6, 7, 8]},
{"features": [9, 10, 11, 12]},
]
)

assert rows_same(result, expected)


# ──────────────────────────────────────
# Integration Tests
# ──────────────────────────────────────
Expand Down