Skip to content

Commit 15b82ae

Browse files
dragongu and bveeramani authored
[Data] Fix driver hang during streaming generator block metadata retrieval (#56451)
## Why are these changes needed? This PR fixes a critical driver hang issue in Ray Data's streaming generator. The problem occurs when computation completes and block data is generated, but the worker crashes before the metadata object is generated, causing the driver to hang completely until the task's metadata is successfully rebuilt. This creates severe performance issues, especially in cluster environments with significant resource fluctuations. ## What was the problem? **Specific scenario:** 1. Computation completes, block data is generated 2. Worker crashes before the metadata object is generated 3. Driver enters the [physical_operator.on_data_ready()](https://github.com/ray-project/ray/blob/ray-2.46.0/python/ray/data/_internal/execution/interfaces/physical_operator.py#L124) logic and waits indefinitely for metadata until task retry succeeds and meta object becomes available 4. If cluster resources are insufficient, the task cannot be retried successfully, causing driver to hang for hours (actual case: 12 hours) **Technical causes:** - Using `ray.get(next(self._streaming_gen))` for metadata content retrieval, which may hang indefinitely - Lack of timeout mechanisms and state tracking, preventing driver recovery from hang state - No proper handling when worker crashes between block generation and metadata generation ## What does this fix do? 
- Adds `_pending_block_ref` and `_pending_meta_ref` state tracking to properly handle block/metadata pairs - Uses `ray.get(meta_ref, timeout=1)` with timeout for metadata content retrieval - Adds error handling for `GetTimeoutError` with warning logs - Prevents unnecessary re-fetching of already obtained block references - **Key improvement: Prevents driver from hanging for extended periods when worker crashes between block and metadata generation** ## Related issue number Fixes critical performance issue in streaming data processing that causes driver to hang for extended periods (up to 12 hours) when workers crash between block generation and metadata generation, especially in cluster environments with significant resource fluctuations. ## Checks - [x] I've signed off every commit(by using the -s flag, i.e., `git commit -s`) in this PR. - [x] I've run `scripts/format.sh` to lint the changes in this PR. - [ ] I've included any doc changes needed for https://docs.ray.io/en/master/. - [x] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - **Testing Strategy** - [x] Unit tests - [ ] Release tests - [ ] This PR is not tested :( --------- Signed-off-by: dragongu <andrewgu@vip.qq.com> Signed-off-by: Balaji Veeramani <bveeramani@berkeley.edu> Co-authored-by: Balaji Veeramani <bveeramani@berkeley.edu> Co-authored-by: Balaji Veeramani <balaji@anyscale.com>
1 parent 6320275 commit 15b82ae

File tree

4 files changed

+263
-124
lines changed

4 files changed

+263
-124
lines changed

python/ray/data/_internal/execution/interfaces/physical_operator.py

Lines changed: 75 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from ray.data._internal.output_buffer import OutputBlockSizeOption
3131
from ray.data._internal.progress_bar import ProgressBar
3232
from ray.data._internal.stats import StatsDict, Timer
33+
from ray.data.block import Block, BlockMetadata
3334
from ray.data.context import DataContext
3435

3536
if TYPE_CHECKING:
@@ -38,6 +39,11 @@
3839

3940
logger = logging.getLogger(__name__)
4041

42+
# Timeout for getting metadata from Ray object references (in seconds)
43+
METADATA_GET_TIMEOUT_S = 1.0
44+
45+
# Timeout for waiting for metadata object to become available (in seconds)
46+
METADATA_WAIT_TIMEOUT_S = 0.1
4147

4248
# TODO(hchen): Ray Core should have a common interface for these two types.
4349
Waitable = Union[ray.ObjectRef, ObjectRefGenerator]
@@ -93,8 +99,8 @@ def __init__(
9399
self,
94100
task_index: int,
95101
streaming_gen: ObjectRefGenerator,
96-
output_ready_callback: Callable[[RefBundle], None],
97-
task_done_callback: Callable[[Optional[Exception]], None],
102+
output_ready_callback: Callable[[RefBundle], None] = lambda bundle: None,
103+
task_done_callback: Callable[[Optional[Exception]], None] = lambda exc: None,
98104
task_resource_bundle: Optional[ExecutionResources] = None,
99105
):
100106
"""Create a DataOpTask
@@ -115,6 +121,13 @@ def __init__(
115121
self._output_ready_callback = output_ready_callback
116122
self._task_done_callback = task_done_callback
117123

124+
# If the generator hasn't produced block metadata yet, or if the block metadata
125+
# object isn't available after we get a reference, we need to store the pending
126+
# references and wait until Ray (re)constructs the block metadata. Either case
127+
# can happen if a node dies after producing a block.
128+
self._pending_block_ref: ray.ObjectRef[Block] = ray.ObjectRef.nil()
129+
self._pending_meta_ref: ray.ObjectRef[BlockMetadata] = ray.ObjectRef.nil()
130+
118131
def get_waitable(self) -> ObjectRefGenerator:
119132
return self._streaming_gen
120133

@@ -128,42 +141,81 @@ def on_data_ready(self, max_bytes_to_read: Optional[int]) -> int:
128141
"""
129142
bytes_read = 0
130143
while max_bytes_to_read is None or bytes_read < max_bytes_to_read:
131-
try:
132-
block_ref = self._streaming_gen._next_sync(0)
133-
if block_ref.is_nil():
144+
if self._pending_block_ref.is_nil():
145+
assert self._pending_meta_ref.is_nil(), (
146+
"This method expects streaming generators to yield blocks then "
147+
"metadata. So, if we have a reference to metadata but not the "
148+
"block, it means there's an error in the implementation."
149+
)
150+
151+
try:
152+
self._pending_block_ref = self._streaming_gen._next_sync(
153+
timeout_s=0
154+
)
155+
except StopIteration:
156+
self._task_done_callback(None)
157+
break
158+
159+
if self._pending_block_ref.is_nil():
134160
# The generator currently doesn't have new output.
135161
# And it's not stopped yet.
136162
break
137-
except StopIteration:
138-
self._task_done_callback(None)
139-
break
163+
164+
if self._pending_meta_ref.is_nil():
165+
try:
166+
self._pending_meta_ref = self._streaming_gen._next_sync(
167+
timeout_s=METADATA_WAIT_TIMEOUT_S
168+
)
169+
except StopIteration:
170+
# The generator should always yield 2 values (block and metadata)
171+
# each time. If we get a StopIteration here, it means an error
172+
# happened in the task.
173+
# And in this case, the block_ref is the exception object.
174+
# TODO(hchen): Ray Core should have a better interface for
175+
# detecting and obtaining the exception.
176+
try:
177+
ray.get(self._pending_block_ref)
178+
assert False, "Above ray.get should raise an exception."
179+
except Exception as ex:
180+
self._task_done_callback(ex)
181+
raise ex from None
182+
183+
if self._pending_meta_ref.is_nil():
184+
# We have a reference to the block but the metadata isn't ready
185+
# yet.
186+
break
140187

141188
try:
189+
# The timeout for `ray.get` includes the time required to ship the
190+
# block metadata to this node. So, if we set the timeout to 0, `ray.get`
191+
# will timeout and possibly cancel the download. To avoid this issue,
192+
# we set the timeout to a small non-zero value.
142193
meta_with_schema: "BlockMetadataWithSchema" = ray.get(
143-
next(self._streaming_gen)
194+
self._pending_meta_ref, timeout=METADATA_GET_TIMEOUT_S
144195
)
145-
except StopIteration:
146-
# The generator should always yield 2 values (block and metadata)
147-
# each time. If we get a StopIteration here, it means an error
148-
# happened in the task.
149-
# And in this case, the block_ref is the exception object.
150-
# TODO(hchen): Ray Core should have a better interface for
151-
# detecting and obtaining the exception.
152-
try:
153-
ray.get(block_ref)
154-
assert False, "Above ray.get should raise an exception."
155-
except Exception as ex:
156-
self._task_done_callback(ex)
157-
raise ex from None
196+
except ray.exceptions.GetTimeoutError:
197+
# We have a reference to the block and its metadata, but the metadata
198+
# object isn't available. This can happen if the node dies.
199+
logger.warning(
200+
f"Metadata object not ready for "
201+
f"ref={self._pending_meta_ref.hex()} "
202+
f"(operator={self.__class__.__name__}). "
203+
f"Metadata may still be computing or worker may have failed and "
204+
f"object is being reconstructed. Will retry in next iteration."
205+
)
206+
break
158207

159208
meta = meta_with_schema.metadata
160209
self._output_ready_callback(
161210
RefBundle(
162-
[(block_ref, meta)],
211+
[(self._pending_block_ref, meta)],
163212
owns_blocks=True,
164213
schema=meta_with_schema.schema,
165214
),
166215
)
216+
self._pending_block_ref = ray.ObjectRef.nil()
217+
self._pending_meta_ref = ray.ObjectRef.nil()
218+
167219
bytes_read += meta.size_bytes
168220

169221
return bytes_read

python/ray/data/_internal/util.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import threading
1010
import time
1111
import urllib.parse
12-
from collections import Counter
1312
from queue import Empty, Full, Queue
1413
from types import ModuleType
1514
from typing import (
@@ -1675,13 +1674,18 @@ def rows_same(actual: pd.DataFrame, expected: pd.DataFrame) -> bool:
16751674
order of rows. This is useful for testing Ray Data because its interface doesn't
16761675
usually guarantee the order of rows.
16771676
"""
1678-
actual_rows = actual.to_dict(orient="records")
1679-
expected_rows = expected.to_dict(orient="records")
1677+
if len(actual) == len(expected) == 0:
1678+
return True
16801679

1681-
actual_items_counts = Counter(frozenset(row.items()) for row in actual_rows)
1682-
expected_items_counts = Counter(frozenset(row.items()) for row in expected_rows)
1683-
1684-
return actual_items_counts == expected_items_counts
1680+
try:
1681+
pd.testing.assert_frame_equal(
1682+
actual.sort_values(sorted(actual.columns)).reset_index(drop=True),
1683+
expected.sort_values(sorted(expected.columns)).reset_index(drop=True),
1684+
check_dtype=False,
1685+
)
1686+
return True
1687+
except AssertionError:
1688+
return False
16851689

16861690

16871691
def merge_resources_to_ray_remote_args(

python/ray/data/tests/preprocessors/test_discretizer.py

Lines changed: 77 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import pytest
33

44
import ray
5+
from ray.data._internal.util import rows_same
56
from ray.data.preprocessors import CustomKBinsDiscretizer, UniformKBinsDiscretizer
67

78

@@ -55,28 +56,27 @@ def test_uniform_kbins_discretizer(
5556
labels_B = dtypes.get("B").categories
5657
ordered_B = dtypes.get("B").ordered
5758

58-
assert out_df["A"].equals(
59-
pd.cut(
60-
in_df["A"],
61-
bins_A,
62-
labels=labels_A,
63-
ordered=ordered_A,
64-
right=right,
65-
include_lowest=include_lowest,
66-
)
59+
# Create expected dataframe with transformed columns
60+
expected_df = in_df.copy()
61+
expected_df["A"] = pd.cut(
62+
in_df["A"],
63+
bins_A,
64+
labels=labels_A,
65+
ordered=ordered_A,
66+
right=right,
67+
include_lowest=include_lowest,
6768
)
68-
assert out_df["B"].equals(
69-
pd.cut(
70-
in_df["B"],
71-
bins_B,
72-
labels=labels_B,
73-
ordered=ordered_B,
74-
right=right,
75-
include_lowest=include_lowest,
76-
)
69+
expected_df["B"] = pd.cut(
70+
in_df["B"],
71+
bins_B,
72+
labels=labels_B,
73+
ordered=ordered_B,
74+
right=right,
75+
include_lowest=include_lowest,
7776
)
78-
# Check that the remaining column was not modified
79-
assert out_df["C"].equals(in_df["C"])
77+
78+
# Use rows_same to compare regardless of row ordering
79+
assert rows_same(out_df, expected_df)
8080

8181
# append mode
8282
expected_message = "The length of columns and output_columns must match."
@@ -95,28 +95,27 @@ def test_uniform_kbins_discretizer(
9595
transformed = discretizer.fit_transform(ds)
9696
out_df = transformed.to_pandas()
9797

98-
assert out_df["A_discretized"].equals(
99-
pd.cut(
100-
in_df["A"],
101-
bins_A,
102-
labels=labels_A,
103-
ordered=ordered_A,
104-
right=right,
105-
include_lowest=include_lowest,
106-
)
98+
# Create expected dataframe with appended columns
99+
expected_df = in_df.copy()
100+
expected_df["A_discretized"] = pd.cut(
101+
in_df["A"],
102+
bins_A,
103+
labels=labels_A,
104+
ordered=ordered_A,
105+
right=right,
106+
include_lowest=include_lowest,
107107
)
108-
assert out_df["B_discretized"].equals(
109-
pd.cut(
110-
in_df["B"],
111-
bins_B,
112-
labels=labels_B,
113-
ordered=ordered_B,
114-
right=right,
115-
include_lowest=include_lowest,
116-
)
108+
expected_df["B_discretized"] = pd.cut(
109+
in_df["B"],
110+
bins_B,
111+
labels=labels_B,
112+
ordered=ordered_B,
113+
right=right,
114+
include_lowest=include_lowest,
117115
)
118-
# Check that the remaining column was not modified
119-
assert out_df["C"].equals(in_df["C"])
116+
117+
# Use rows_same to compare regardless of row ordering
118+
assert rows_same(out_df, expected_df)
120119

121120

122121
@pytest.mark.parametrize(
@@ -171,28 +170,27 @@ def test_custom_kbins_discretizer(
171170
labels_B = dtypes.get("B").categories
172171
ordered_B = dtypes.get("B").ordered
173172

174-
assert out_df["A"].equals(
175-
pd.cut(
176-
in_df["A"],
177-
bins_A,
178-
labels=labels_A,
179-
ordered=ordered_A,
180-
right=right,
181-
include_lowest=include_lowest,
182-
)
173+
# Create expected dataframe with transformed columns
174+
expected_df = in_df.copy()
175+
expected_df["A"] = pd.cut(
176+
in_df["A"],
177+
bins_A,
178+
labels=labels_A,
179+
ordered=ordered_A,
180+
right=right,
181+
include_lowest=include_lowest,
183182
)
184-
assert out_df["B"].equals(
185-
pd.cut(
186-
in_df["B"],
187-
bins_B,
188-
labels=labels_B,
189-
ordered=ordered_B,
190-
right=right,
191-
include_lowest=include_lowest,
192-
)
183+
expected_df["B"] = pd.cut(
184+
in_df["B"],
185+
bins_B,
186+
labels=labels_B,
187+
ordered=ordered_B,
188+
right=right,
189+
include_lowest=include_lowest,
193190
)
194-
# Check that the remaining column was not modified
195-
assert out_df["C"].equals(in_df["C"])
191+
192+
# Use rows_same to compare regardless of row ordering
193+
assert rows_same(out_df, expected_df)
196194

197195
# append mode
198196
expected_message = "The length of columns and output_columns must match."
@@ -211,28 +209,27 @@ def test_custom_kbins_discretizer(
211209
transformed = discretizer.fit_transform(ds)
212210
out_df = transformed.to_pandas()
213211

214-
assert out_df["A_discretized"].equals(
215-
pd.cut(
216-
in_df["A"],
217-
bins_A,
218-
labels=labels_A,
219-
ordered=ordered_A,
220-
right=right,
221-
include_lowest=include_lowest,
222-
)
212+
# Create expected dataframe with appended columns
213+
expected_df = in_df.copy()
214+
expected_df["A_discretized"] = pd.cut(
215+
in_df["A"],
216+
bins_A,
217+
labels=labels_A,
218+
ordered=ordered_A,
219+
right=right,
220+
include_lowest=include_lowest,
223221
)
224-
assert out_df["B_discretized"].equals(
225-
pd.cut(
226-
in_df["B"],
227-
bins_B,
228-
labels=labels_B,
229-
ordered=ordered_B,
230-
right=right,
231-
include_lowest=include_lowest,
232-
)
222+
expected_df["B_discretized"] = pd.cut(
223+
in_df["B"],
224+
bins_B,
225+
labels=labels_B,
226+
ordered=ordered_B,
227+
right=right,
228+
include_lowest=include_lowest,
233229
)
234-
# Check that the remaining column was not modified
235-
assert out_df["C"].equals(in_df["C"])
230+
231+
# Use rows_same to compare regardless of row ordering
232+
assert rows_same(out_df, expected_df)
236233

237234

238235
if __name__ == "__main__":

0 commit comments

Comments
 (0)