-
Notifications
You must be signed in to change notification settings - Fork 7.3k
[data][train] Refactor call_with_retry into shared library and use it to retry checkpoint upload #56608
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
justinvyu
merged 10 commits into
ray-project:master
from
TimothySeah:tseah/retry-checkpoint-upload
Sep 23, 2025
Merged
[data][train] Refactor call_with_retry into shared library and use it to retry checkpoint upload #56608
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
c80ebff
[data][train] Refactor call_with_retry into shared library
TimothySeah 6d1de38
address pr comments
TimothySeah eb6630f
add todo
TimothySeah 2e6ad07
Merge remote-tracking branch 'upstream/master' into tseah/retry-checkpoint-upload
TimothySeah f981df9
moved retryable tokens to train with aws prefixes
TimothySeah 0323ee6
fix assertion
TimothySeah f701580
convert exception to str once + use 20% jitter
TimothySeah 17154f5
Merge remote-tracking branch 'upstream/master' into tseah/retry-checkpoint-upload
TimothySeah d13a897
attempt to fix timeout by making average retry wait the same as before
TimothySeah ce89a27
fix unit test
TimothySeah File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| import functools | ||
| import logging | ||
| import random | ||
| import time | ||
| from typing import Any, Callable, List, Optional | ||
|
|
||
logger = logging.getLogger(__name__)


def call_with_retry(
    f: Callable,
    description: str,
    match: Optional[List[str]] = None,
    max_attempts: int = 10,
    max_backoff_s: int = 32,
    *args,
    **kwargs,
) -> Any:
    """Call ``f`` and retry it with exponential backoff when it raises.

    An exception is considered retryable when ``match`` is ``None`` or when at
    least one string in ``match`` is a substring of ``str(exception)``. An empty
    ``match`` list therefore retries nothing. Only ``Exception`` subclasses are
    caught; other ``BaseException`` types propagate immediately.

    Args:
        f: The function to retry.
        description: An imperative description of the function being retried. For
            example, "open the file". Used only in debug log messages.
        match: A list of strings to match in the exception message. If ``None``, any
            error is retried.
        max_attempts: The maximum number of times ``f`` is called, including the
            first call. Must be at least 1.
        max_backoff_s: The upper bound, in seconds, on the backoff before jitter
            is applied.
        *args: Arguments to pass to the function.
        **kwargs: Keyword arguments to pass to the function.

    Returns:
        The result of the first call to ``f`` that does not raise.

    Raises:
        AssertionError: If ``max_attempts`` is less than 1.
        Exception: The exception raised by ``f`` on the final attempt, or the
            first non-retryable exception. It is re-raised unchanged, so any
            ``__cause__`` / ``__context__`` chain set by ``f`` is preserved.
    """
    # TODO: consider inverse match and matching exception type
    assert max_attempts >= 1, f"`max_attempts` must be positive. Got {max_attempts}."

    for i in range(max_attempts):
        try:
            return f(*args, **kwargs)
        except Exception as e:
            # Convert once; the message is used for matching and for logging.
            exception_str = str(e)
            is_retryable = match is None or any(
                pattern in exception_str for pattern in match
            )
            if is_retryable and i + 1 < max_attempts:
                # Retry with binary exponential backoff with 20% random jitter.
                backoff = min(2**i, max_backoff_s) * (random.uniform(0.8, 1.2))
                logger.debug(
                    f"Retrying {i+1} attempts to {description} after {backoff} seconds."
                )
                time.sleep(backoff)
                continue

            if is_retryable:
                logger.debug(
                    f"Failed to {description} after {max_attempts} attempts. Raising."
                )
            else:
                logger.debug(
                    f"Did not find a match for {exception_str}. Raising after {i+1} attempts."
                )
            # Bare ``raise`` re-raises the active exception untouched.
            # ``raise e from None`` would overwrite ``e.__cause__`` with ``None``
            # and discard a chain the callee set via ``raise X from Y``.
            raise
|
|
||
|
|
||
def retry(
    description: str,
    match: Optional[List[str]] = None,
    max_attempts: int = 10,
    max_backoff_s: int = 32,
) -> Callable:
    """Return a decorator that routes calls through ``call_with_retry``.

    Args:
        description: Forwarded to ``call_with_retry`` as ``description``.
        match: Forwarded to ``call_with_retry`` as ``match``.
        max_attempts: Forwarded to ``call_with_retry`` as ``max_attempts``.
        max_backoff_s: Forwarded to ``call_with_retry`` as ``max_backoff_s``.

    Returns:
        A decorator. The decorated function keeps its metadata (via
        ``functools.wraps``) and, when called, passes its positional and
        keyword arguments on to the wrapped function through
        ``call_with_retry``.
    """
    # Retry settings in the positional order ``call_with_retry`` expects them
    # immediately after the callable.
    retry_settings = (description, match, max_attempts, max_backoff_s)

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*call_args, **call_kwargs):
            return call_with_retry(func, *retry_settings, *call_args, **call_kwargs)

        return wrapper

    return decorator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| import sys | ||
|
|
||
| import pytest | ||
|
|
||
| from ray._common.retry import ( | ||
| call_with_retry, | ||
| retry, | ||
| ) | ||
|
|
||
|
|
||
def test_call_with_retry_immediate_success_with_args():
    """Positional arguments after the retry settings reach the wrapped function."""

    def func(first, second):
        return [first, second]

    result = call_with_retry(func, "func", [], 1, 0, "a", "b")
    assert result == ["a", "b"]
|
|
||
|
|
||
def test_retry_immediate_success_with_object_args():
    """The decorator works on methods: ``self`` and the arguments are forwarded."""

    class MyClass:
        @retry("func", [], 1, 0)
        def func(self, first, second):
            return [first, second]

    instance = MyClass()
    assert instance.func("a", "b") == ["a", "b"]
|
|
||
|
|
||
@pytest.mark.parametrize("use_decorator", [True, False])
def test_retry_last_attempt_successful_with_appropriate_wait_time(
    monkeypatch, use_decorator
):
    """Succeeding on the final attempt returns the value after capped backoff."""
    # Record every requested sleep instead of actually sleeping.
    recorded_sleeps = []
    monkeypatch.setattr("time.sleep", recorded_sleeps.append)
    # Pin the jitter factor to 1 so the backoff values are deterministic.
    monkeypatch.setattr("random.uniform", lambda a, b: 1)

    pattern = "have not reached 4th attempt"
    attempts = []

    def func():
        attempts.append(None)
        if len(attempts) == 4:
            return "success"
        raise ValueError(pattern)

    args = ["func", [pattern], 4, 3]
    if use_decorator:
        outcome = retry(*args)(func)()
    else:
        outcome = call_with_retry(func, *args)
    assert outcome == "success"
    assert sum(recorded_sleeps) == 6  # 1 + 2 + 3
|
|
||
|
|
||
@pytest.mark.parametrize("use_decorator", [True, False])
def test_retry_unretryable_error(use_decorator):
    """An error whose message matches no pattern is raised after a single call."""
    attempts = []

    def func():
        attempts.append(None)
        raise ValueError("unretryable error")

    args = ["func", ["only retryable error"], 10, 0]
    with pytest.raises(ValueError, match="unretryable error"):
        if not use_decorator:
            call_with_retry(func, *args)
        else:
            retry(*args)(func)()
    assert len(attempts) == 1
|
|
||
|
|
||
@pytest.mark.parametrize("use_decorator", [True, False])
def test_retry_fail_all_attempts_retry_all_errors(use_decorator):
    """With ``match=None`` every error is retried until attempts run out."""
    attempts = []

    def func():
        attempts.append(None)
        # Each failure carries a different message; all must still be retried.
        raise ValueError(str(len(attempts)))

    args = ["func", None, 3, 0]
    with pytest.raises(ValueError):
        if not use_decorator:
            call_with_retry(func, *args)
        else:
            retry(*args)(func)()
    assert len(attempts) == 3
|
|
||
|
|
||
if __name__ == "__main__":
    # Run this file's tests when it is executed directly and use pytest's
    # return value as the process exit status.
    exit_code = pytest.main(["-sv", __file__])
    sys.exit(exit_code)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.