Skip to content

Commit 6adb3c7

Browse files
author
Shallow Copy Bot
committed
[Data] Fix driver hang during streaming generator block metadata retrieval
Original PR #56451 by dragongu Original: ray-project/ray#56451
1 parent c4075ab commit 6adb3c7

File tree

4 files changed

+263
-124
lines changed

4 files changed

+263
-124
lines changed

python/ray/data/_internal/execution/interfaces/physical_operator.py

Lines changed: 75 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from ray.data._internal.output_buffer import OutputBlockSizeOption
3131
from ray.data._internal.progress_bar import ProgressBar
3232
from ray.data._internal.stats import StatsDict, Timer
33+
from ray.data.block import Block, BlockMetadata
3334
from ray.data.context import DataContext
3435

3536
if TYPE_CHECKING:
@@ -38,6 +39,11 @@
3839

3940
logger = logging.getLogger(__name__)
4041

42+
# Timeout for getting metadata from Ray object references (in seconds)
43+
METADATA_GET_TIMEOUT_S = 1.0
44+
45+
# Timeout for waiting for metadata object to become available (in seconds)
46+
METADATA_WAIT_TIMEOUT_S = 0.1
4147

4248
# TODO(hchen): Ray Core should have a common interface for these two types.
4349
Waitable = Union[ray.ObjectRef, ObjectRefGenerator]
@@ -93,8 +99,8 @@ def __init__(
9399
self,
94100
task_index: int,
95101
streaming_gen: ObjectRefGenerator,
96-
output_ready_callback: Callable[[RefBundle], None],
97-
task_done_callback: Callable[[Optional[Exception]], None],
102+
output_ready_callback: Callable[[RefBundle], None] = lambda bundle: None,
103+
task_done_callback: Callable[[Optional[Exception]], None] = lambda exc: None,
98104
task_resource_bundle: Optional[ExecutionResources] = None,
99105
):
100106
"""Create a DataOpTask
@@ -115,6 +121,13 @@ def __init__(
115121
self._output_ready_callback = output_ready_callback
116122
self._task_done_callback = task_done_callback
117123

124+
# If the generator hasn't produced block metadata yet, or if the block metadata
125+
# object isn't available after we get a reference, we need to store the pending
126+
# references and wait until Ray (re)constructs the block metadata. Either case
127+
# can happen if a node dies after producing a block.
128+
self._pending_block_ref: ray.ObjectRef[Block] = ray.ObjectRef.nil()
129+
self._pending_meta_ref: ray.ObjectRef[BlockMetadata] = ray.ObjectRef.nil()
130+
118131
def get_waitable(self) -> ObjectRefGenerator:
119132
return self._streaming_gen
120133

@@ -128,42 +141,81 @@ def on_data_ready(self, max_bytes_to_read: Optional[int]) -> int:
128141
"""
129142
bytes_read = 0
130143
while max_bytes_to_read is None or bytes_read < max_bytes_to_read:
131-
try:
132-
block_ref = self._streaming_gen._next_sync(0)
133-
if block_ref.is_nil():
144+
if self._pending_block_ref.is_nil():
145+
assert self._pending_meta_ref.is_nil(), (
146+
"This method expects streaming generators to yield blocks then "
147+
"metadata. So, if we have a reference to metadata but not the "
148+
"block, it means there's an error in the implementation."
149+
)
150+
151+
try:
152+
self._pending_block_ref = self._streaming_gen._next_sync(
153+
timeout_s=0
154+
)
155+
except StopIteration:
156+
self._task_done_callback(None)
157+
break
158+
159+
if self._pending_block_ref.is_nil():
134160
# The generator currently doesn't have new output.
135161
# And it's not stopped yet.
136162
break
137-
except StopIteration:
138-
self._task_done_callback(None)
139-
break
163+
164+
if self._pending_meta_ref.is_nil():
165+
try:
166+
self._pending_meta_ref = self._streaming_gen._next_sync(
167+
timeout_s=METADATA_WAIT_TIMEOUT_S
168+
)
169+
except StopIteration:
170+
# The generator should always yield 2 values (block and metadata)
171+
# each time. If we get a StopIteration here, it means an error
172+
# happened in the task.
173+
# And in this case, the block_ref is the exception object.
174+
# TODO(hchen): Ray Core should have a better interface for
175+
# detecting and obtaining the exception.
176+
try:
177+
ray.get(self._pending_block_ref)
178+
assert False, "Above ray.get should raise an exception."
179+
except Exception as ex:
180+
self._task_done_callback(ex)
181+
raise ex from None
182+
183+
if self._pending_meta_ref.is_nil():
184+
# We have a reference to the block but the metadata isn't ready
185+
# yet.
186+
break
140187

141188
try:
189+
# The timeout for `ray.get` includes the time required to ship the
190+
# block metadata to this node. So, if we set the timeout to 0, `ray.get`
191+
# will timeout and possibly cancel the download. To avoid this issue,
192+
# we set the timeout to a small non-zero value.
142193
meta_with_schema: "BlockMetadataWithSchema" = ray.get(
143-
next(self._streaming_gen)
194+
self._pending_meta_ref, timeout=METADATA_GET_TIMEOUT_S
144195
)
145-
except StopIteration:
146-
# The generator should always yield 2 values (block and metadata)
147-
# each time. If we get a StopIteration here, it means an error
148-
# happened in the task.
149-
# And in this case, the block_ref is the exception object.
150-
# TODO(hchen): Ray Core should have a better interface for
151-
# detecting and obtaining the exception.
152-
try:
153-
ray.get(block_ref)
154-
assert False, "Above ray.get should raise an exception."
155-
except Exception as ex:
156-
self._task_done_callback(ex)
157-
raise ex from None
196+
except ray.exceptions.GetTimeoutError:
197+
# We have a reference to the block and its metadata, but the metadata
198+
# object isn't available. This can happen if the node dies.
199+
logger.warning(
200+
f"Metadata object not ready for "
201+
f"ref={self._pending_meta_ref.hex()} "
202+
f"(operator={self.__class__.__name__}). "
203+
f"Metadata may still be computing or worker may have failed and "
204+
f"object is being reconstructed. Will retry in next iteration."
205+
)
206+
break
158207

159208
meta = meta_with_schema.metadata
160209
self._output_ready_callback(
161210
RefBundle(
162-
[(block_ref, meta)],
211+
[(self._pending_block_ref, meta)],
163212
owns_blocks=True,
164213
schema=meta_with_schema.schema,
165214
),
166215
)
216+
self._pending_block_ref = ray.ObjectRef.nil()
217+
self._pending_meta_ref = ray.ObjectRef.nil()
218+
167219
bytes_read += meta.size_bytes
168220

169221
return bytes_read

python/ray/data/_internal/util.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import threading
1010
import time
1111
import urllib.parse
12-
from collections import Counter
1312
from queue import Empty, Full, Queue
1413
from types import ModuleType
1514
from typing import (
@@ -1675,13 +1674,18 @@ def rows_same(actual: pd.DataFrame, expected: pd.DataFrame) -> bool:
16751674
order of rows. This is useful for testing Ray Data because its interface doesn't
16761675
usually guarantee the order of rows.
16771676
"""
1678-
actual_rows = actual.to_dict(orient="records")
1679-
expected_rows = expected.to_dict(orient="records")
1677+
if len(actual) == len(expected) == 0:
1678+
return True
16801679

1681-
actual_items_counts = Counter(frozenset(row.items()) for row in actual_rows)
1682-
expected_items_counts = Counter(frozenset(row.items()) for row in expected_rows)
1683-
1684-
return actual_items_counts == expected_items_counts
1680+
try:
1681+
pd.testing.assert_frame_equal(
1682+
actual.sort_values(sorted(actual.columns)).reset_index(drop=True),
1683+
expected.sort_values(sorted(expected.columns)).reset_index(drop=True),
1684+
check_dtype=False,
1685+
)
1686+
return True
1687+
except AssertionError:
1688+
return False
16851689

16861690

16871691
def merge_resources_to_ray_remote_args(

python/ray/data/tests/preprocessors/test_discretizer.py

Lines changed: 77 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import pytest
33

44
import ray
5+
from ray.data._internal.util import rows_same
56
from ray.data.preprocessors import CustomKBinsDiscretizer, UniformKBinsDiscretizer
67

78

@@ -55,28 +56,27 @@ def test_uniform_kbins_discretizer(
5556
labels_B = dtypes.get("B").categories
5657
ordered_B = dtypes.get("B").ordered
5758

58-
assert out_df["A"].equals(
59-
pd.cut(
60-
in_df["A"],
61-
bins_A,
62-
labels=labels_A,
63-
ordered=ordered_A,
64-
right=right,
65-
include_lowest=include_lowest,
66-
)
59+
# Create expected dataframe with transformed columns
60+
expected_df = in_df.copy()
61+
expected_df["A"] = pd.cut(
62+
in_df["A"],
63+
bins_A,
64+
labels=labels_A,
65+
ordered=ordered_A,
66+
right=right,
67+
include_lowest=include_lowest,
6768
)
68-
assert out_df["B"].equals(
69-
pd.cut(
70-
in_df["B"],
71-
bins_B,
72-
labels=labels_B,
73-
ordered=ordered_B,
74-
right=right,
75-
include_lowest=include_lowest,
76-
)
69+
expected_df["B"] = pd.cut(
70+
in_df["B"],
71+
bins_B,
72+
labels=labels_B,
73+
ordered=ordered_B,
74+
right=right,
75+
include_lowest=include_lowest,
7776
)
78-
# Check that the remaining column was not modified
79-
assert out_df["C"].equals(in_df["C"])
77+
78+
# Use rows_same to compare regardless of row ordering
79+
assert rows_same(out_df, expected_df)
8080

8181
# append mode
8282
expected_message = "The length of columns and output_columns must match."
@@ -95,28 +95,27 @@ def test_uniform_kbins_discretizer(
9595
transformed = discretizer.fit_transform(ds)
9696
out_df = transformed.to_pandas()
9797

98-
assert out_df["A_discretized"].equals(
99-
pd.cut(
100-
in_df["A"],
101-
bins_A,
102-
labels=labels_A,
103-
ordered=ordered_A,
104-
right=right,
105-
include_lowest=include_lowest,
106-
)
98+
# Create expected dataframe with appended columns
99+
expected_df = in_df.copy()
100+
expected_df["A_discretized"] = pd.cut(
101+
in_df["A"],
102+
bins_A,
103+
labels=labels_A,
104+
ordered=ordered_A,
105+
right=right,
106+
include_lowest=include_lowest,
107107
)
108-
assert out_df["B_discretized"].equals(
109-
pd.cut(
110-
in_df["B"],
111-
bins_B,
112-
labels=labels_B,
113-
ordered=ordered_B,
114-
right=right,
115-
include_lowest=include_lowest,
116-
)
108+
expected_df["B_discretized"] = pd.cut(
109+
in_df["B"],
110+
bins_B,
111+
labels=labels_B,
112+
ordered=ordered_B,
113+
right=right,
114+
include_lowest=include_lowest,
117115
)
118-
# Check that the remaining column was not modified
119-
assert out_df["C"].equals(in_df["C"])
116+
117+
# Use rows_same to compare regardless of row ordering
118+
assert rows_same(out_df, expected_df)
120119

121120

122121
@pytest.mark.parametrize(
@@ -171,28 +170,27 @@ def test_custom_kbins_discretizer(
171170
labels_B = dtypes.get("B").categories
172171
ordered_B = dtypes.get("B").ordered
173172

174-
assert out_df["A"].equals(
175-
pd.cut(
176-
in_df["A"],
177-
bins_A,
178-
labels=labels_A,
179-
ordered=ordered_A,
180-
right=right,
181-
include_lowest=include_lowest,
182-
)
173+
# Create expected dataframe with transformed columns
174+
expected_df = in_df.copy()
175+
expected_df["A"] = pd.cut(
176+
in_df["A"],
177+
bins_A,
178+
labels=labels_A,
179+
ordered=ordered_A,
180+
right=right,
181+
include_lowest=include_lowest,
183182
)
184-
assert out_df["B"].equals(
185-
pd.cut(
186-
in_df["B"],
187-
bins_B,
188-
labels=labels_B,
189-
ordered=ordered_B,
190-
right=right,
191-
include_lowest=include_lowest,
192-
)
183+
expected_df["B"] = pd.cut(
184+
in_df["B"],
185+
bins_B,
186+
labels=labels_B,
187+
ordered=ordered_B,
188+
right=right,
189+
include_lowest=include_lowest,
193190
)
194-
# Check that the remaining column was not modified
195-
assert out_df["C"].equals(in_df["C"])
191+
192+
# Use rows_same to compare regardless of row ordering
193+
assert rows_same(out_df, expected_df)
196194

197195
# append mode
198196
expected_message = "The length of columns and output_columns must match."
@@ -211,28 +209,27 @@ def test_custom_kbins_discretizer(
211209
transformed = discretizer.fit_transform(ds)
212210
out_df = transformed.to_pandas()
213211

214-
assert out_df["A_discretized"].equals(
215-
pd.cut(
216-
in_df["A"],
217-
bins_A,
218-
labels=labels_A,
219-
ordered=ordered_A,
220-
right=right,
221-
include_lowest=include_lowest,
222-
)
212+
# Create expected dataframe with appended columns
213+
expected_df = in_df.copy()
214+
expected_df["A_discretized"] = pd.cut(
215+
in_df["A"],
216+
bins_A,
217+
labels=labels_A,
218+
ordered=ordered_A,
219+
right=right,
220+
include_lowest=include_lowest,
223221
)
224-
assert out_df["B_discretized"].equals(
225-
pd.cut(
226-
in_df["B"],
227-
bins_B,
228-
labels=labels_B,
229-
ordered=ordered_B,
230-
right=right,
231-
include_lowest=include_lowest,
232-
)
222+
expected_df["B_discretized"] = pd.cut(
223+
in_df["B"],
224+
bins_B,
225+
labels=labels_B,
226+
ordered=ordered_B,
227+
right=right,
228+
include_lowest=include_lowest,
233229
)
234-
# Check that the remaining column was not modified
235-
assert out_df["C"].equals(in_df["C"])
230+
231+
# Use rows_same to compare regardless of row ordering
232+
assert rows_same(out_df, expected_df)
236233

237234

238235
if __name__ == "__main__":

0 commit comments

Comments
 (0)