Skip to content
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c2b03a8
[data]Compute Expressions-fixed-size-array-operations
400Ping Nov 18, 2025
b1615a8
fix prpperty
400Ping Nov 18, 2025
950ef28
add type-checking
400Ping Nov 18, 2025
7819f4b
[data] Improve array namespace docs and tests
400Ping Nov 21, 2025
0bc9400
fix linter
400Ping Nov 21, 2025
7a51d0b
fix
400Ping Nov 23, 2025
6cd7558
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 26, 2025
d666c74
fix
400Ping Nov 26, 2025
afb4571
fix linter
400Ping Nov 26, 2025
2957754
fix test
400Ping Nov 26, 2025
040877a
fix test_namespace_expressions
400Ping Nov 27, 2025
0fd0ff4
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 27, 2025
24f7eaf
Merge branch 'master' into data/compute-expressions-arr
400Ping Nov 29, 2025
c9b92a6
fix
400Ping Dec 1, 2025
804c228
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 1, 2025
a7e4520
Update python/ray/data/namespace_expressions/arr_namespace.py
400Ping Dec 1, 2025
70dbbd0
fix format
400Ping Dec 1, 2025
7a6a2fb
fix
400Ping Dec 5, 2025
244e2b0
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 10, 2025
2829610
fix
400Ping Dec 12, 2025
4717267
update
400Ping Dec 17, 2025
bb6fb05
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 17, 2025
c72eb5b
remove flatten()
400Ping Dec 31, 2025
71e4784
Merge branch 'master' into data/compute-expressions-arr
400Ping Dec 31, 2025
c46d1fa
Merge remote-tracking branch 'upstream/master' into data/compute-expr…
400Ping Jan 9, 2026
c3d3bb2
fix CI error
400Ping Jan 9, 2026
aa9ded9
update
400Ping Jan 14, 2026
2a283d8
fix
400Ping Jan 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions python/ray/data/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from ray.util.annotations import DeveloperAPI, PublicAPI

if TYPE_CHECKING:
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace
from ray.data.namespace_expressions.dt_namespace import _DatetimeNamespace
from ray.data.namespace_expressions.list_namespace import _ListNamespace
from ray.data.namespace_expressions.string_namespace import _StringNamespace
Expand Down Expand Up @@ -418,12 +419,20 @@ def alias(self, name: str) -> "Expr":
data_type=self.data_type, expr=self, _name=name, _is_rename=False
)

@property
def arr(self) -> "_ArrayNamespace":
"""Access array operations for this expression."""
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace

return _ArrayNamespace(self)

@property
def list(self) -> "_ListNamespace":
"""Access list operations for this expression.

Returns:
A _ListNamespace that provides list-specific operations.
A _ListNamespace that provides list-specific operations for both
PyArrow ``List`` and ``FixedSizeList`` columns.

Example:
>>> from ray.data.expressions import col
Expand Down Expand Up @@ -1066,6 +1075,7 @@ def download(uri_column_name: str) -> DownloadExpr:
"lit",
"download",
"star",
"_ArrayNamespace",
"_ListNamespace",
"_StringNamespace",
"_StructNamespace",
Expand All @@ -1075,7 +1085,11 @@ def download(uri_column_name: str) -> DownloadExpr:

def __getattr__(name: str):
"""Lazy import of namespace classes to avoid circular imports."""
if name == "_ListNamespace":
if name == "_ArrayNamespace":
from ray.data.namespace_expressions.arr_namespace import _ArrayNamespace

return _ArrayNamespace
elif name == "_ListNamespace":
from ray.data.namespace_expressions.list_namespace import _ListNamespace

return _ListNamespace
Expand Down
54 changes: 54 additions & 0 deletions python/ray/data/namespace_expressions/arr_namespace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Array namespace for expression operations on array-typed columns."""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

import pyarrow

from ray.data.datatype import DataType
from ray.data.expressions import pyarrow_udf

if TYPE_CHECKING:
from ray.data.expressions import Expr, UDFExpr


@dataclass
class _ArrayNamespace:
"""Namespace for array operations on expression columns.

Example:
>>> from ray.data.expressions import col
>>> # Convert fixed-size lists to variable-length lists
>>> expr = col("features").arr.to_list()
"""

_expr: Expr

def to_list(self) -> "UDFExpr":
"""Convert FixedSizeList columns into variable-length lists."""
return_dtype = DataType(object)

expr_dtype = self._expr.data_type
if expr_dtype.is_list_type():
arrow_type = expr_dtype.to_arrow_dtype()
if pyarrow.types.is_fixed_size_list(arrow_type):
return_dtype = DataType.from_arrow(pyarrow.list_(arrow_type.value_type))
else:
return_dtype = expr_dtype

@pyarrow_udf(return_dtype=return_dtype)
def _to_list(arr: pyarrow.Array) -> pyarrow.Array:
arr_dtype = DataType.from_arrow(arr.type)
if not arr_dtype.is_list_type():
raise pyarrow.lib.ArrowInvalid(
"to_list() can only be called on list-like columns, "
f"but got {arr.type}"
)

if isinstance(arr.type, pyarrow.FixedSizeListType):
return arr.cast(pyarrow.list_(arr.type.value_type))
return arr

return _to_list(self._expr)
27 changes: 27 additions & 0 deletions python/ray/data/namespace_expressions/list_namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class _ListNamespace:
>>> expr = col("items").list[0]
>>> # Slice list
>>> expr = col("items").list[1:3]
>>> # Flatten nested list columns
>>> expr = col("features").list.flatten()
"""

_expr: Expr
Expand Down Expand Up @@ -120,3 +122,28 @@ def _list_slice(arr: pyarrow.Array) -> pyarrow.Array:
)

return _list_slice(self._expr)

def flatten(self) -> "UDFExpr":
"""Flatten nested list columns (including FixedSizeList variants).

This wraps :func:`pyarrow.compute.list_flatten` and preserves the inner
element type when possible.
"""
return_dtype = DataType(object)

expr_dtype = self._expr.data_type
if expr_dtype.is_list_type():
outer_arrow_type = expr_dtype.to_arrow_dtype()
inner_arrow_type = outer_arrow_type.value_type

# Inner list type; list_flatten(list<list<T>>) -> list<T>
inner_dtype = DataType.from_arrow(inner_arrow_type)
if inner_dtype.is_list_type():
value_arrow_type = inner_arrow_type.value_type
return_dtype = DataType.from_arrow(pyarrow.list_(value_arrow_type))

@pyarrow_udf(return_dtype=return_dtype)
def _flatten(arr: pyarrow.Array) -> pyarrow.Array:
return pc.list_flatten(arr)

return _flatten(self._expr)
69 changes: 69 additions & 0 deletions python/ray/data/tests/test_namespace_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,75 @@ def test_dt_namespace_invalid_dtype_raises(ray_start_regular):
ds.select(col("value").dt.year()).to_pandas()


# ──────────────────────────────────────
# List Namespace Tests (including fixed-size lists)
# ──────────────────────────────────────


def _make_fixed_size_list_table() -> pa.Table:
# Build a FixedSizeListArray with 3 rows, each of length 2:
# [[1, 2], [3, 4], [5, 6]]
values = pa.array([1, 2, 3, 4, 5, 6], type=pa.int64())
fixed = pa.FixedSizeListArray.from_arrays(values, list_size=2)
return pa.Table.from_arrays([fixed], names=["features"])


def _make_nested_fixed_size_list_table() -> pa.Table:
# Build a nested FixedSizeListArray with 3 rows:
#
# row0 -> [[1, 2], [3, 4]]
# row1 -> [[5, 6], [7, 8]]
# row2 -> [[9, 10], [11, 12]]
#
# Type is: fixed_size_list(list<int64>, 2)
inner_type = pa.list_(pa.int64())
inner_values = pa.array(
[
[1, 2],
[3, 4],
[5, 6],
[7, 8],
[9, 10],
[11, 12],
],
type=inner_type,
)
fixed_nested = pa.FixedSizeListArray.from_arrays(inner_values, list_size=2)
return pa.Table.from_arrays([fixed_nested], names=["features"])


def test_arr_to_list_fixed_size(ray_start_regular):
table = _make_fixed_size_list_table()
ds = ray.data.from_arrow(table)

result = ds.select(col("features").arr.to_list().alias("features")).to_pandas()
expected = pd.DataFrame(
[
{"features": [1, 2]},
{"features": [3, 4]},
{"features": [5, 6]},
]
)

assert rows_same(result, expected)


def test_list_flatten_fixed_size(ray_start_regular):
table = _make_nested_fixed_size_list_table()
ds = ray.data.from_arrow(table)

result = ds.select(col("features").list.flatten().alias("features")).to_pandas()
expected = pd.DataFrame(
[
{"features": [1, 2, 3, 4]},
{"features": [5, 6, 7, 8]},
{"features": [9, 10, 11, 12]},
]
)

assert rows_same(result, expected)


# ──────────────────────────────────────
# Integration Tests
# ──────────────────────────────────────
Expand Down