Commit 9e2d5f4

update all docs

1 parent: 3a68eb6

File tree: 6 files changed, +38 -38 lines


README.md

Lines changed: 8 additions & 7 deletions

@@ -56,12 +56,13 @@ Here's a simple example where we "train" a model on two nodes (with 2 GPUs each)
 import torchrunx as trx

 if __name__ == "__main__":
-    trained_model = trx.launch(
+    result = trx.launch(
         func=train,
         hostnames=["localhost", "other_node"],
-        workers_per_host=2 # num. GPUs
-    ).value(rank=0) # get returned object
+        workers_per_host=2 # number of GPUs
+    )

+    trained_model = result.rank(0)
     torch.save(trained_model.state_dict(), "model.pth")
 ```

@@ -70,9 +71,9 @@ if __name__ == "__main__":

 ## Why should I use this?

-Whether you have 1 GPU, 8 GPUs, or 8 machines.
+Whether you have 1 GPU, 8 GPUs, or 8 machines:

-__Features:__
+__Features__

 - Our [`launch()`](https://torchrunx.readthedocs.io/stable/api.html#torchrunx.launch) utility is super _Pythonic_
 - Return objects from your workers

@@ -81,13 +82,13 @@ __Features:__
 - Fine-grained control over logging, environment variables, exception handling, etc.
 - Automatic integration with SLURM

-__Robustness:__
+__Robustness__

 - If you want to run a complex, _modular_ workflow in __one__ script
   - don't parallelize your entire script: just the functions you want!
   - no worries about memory leaks or OS failures

-__Convenience:__
+__Convenience__

 - If you don't want to:
   - set up [`dist.init_process_group`](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) yourself
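
Pieced together, the updated README example reads as follows. This is a minimal runnable sketch: the `train` function is a hypothetical stand-in, and the single-machine hostnames are illustrative; `launch()` and `LaunchResult.rank()` are as introduced in this commit.

```python
# Sketch of the updated README API. `train` is hypothetical.
import torch
import torch.nn as nn

import torchrunx as trx


def train() -> nn.Module:
    # Hypothetical worker: a real script would wrap the model in DDP and run
    # a training loop here. torchrunx initializes the distributed environment.
    return nn.Linear(10, 1)


if __name__ == "__main__":
    result = trx.launch(
        func=train,
        hostnames=["localhost"],  # single machine for this sketch
        workers_per_host=2,       # number of GPUs
    )
    trained_model = result.rank(0)  # return value of the global rank-0 worker
    torch.save(trained_model.state_dict(), "model.pth")
```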

docs/source/advanced.rst

Lines changed: 10 additions & 10 deletions

@@ -14,19 +14,19 @@ We could also launch multiple functions (e.g. train on many GPUs, test on one GPU)
     func=train,
     hostnames=["node1", "node2"],
     workers_per_host=8
-).value(rank=0)
+).rank(0)

 accuracy = trx.launch(
     func=test,
-    func_kwargs={'model': model},
+    func_args=(trained_model,),
     hostnames=["localhost"],
     workers_per_host=1
-).value(rank=0)
+).rank(0)

 print(f'Accuracy: {accuracy}')

-:mod:`torchrunx.launch` is self-cleaning: all processes are terminated (and the used memory is completely released) after each invocation.
+:mod:`torchrunx.launch` is self-cleaning: all processes are terminated (and the used memory is completely released) before the subsequent invocation.

 Launcher class
 --------------

@@ -85,9 +85,9 @@ Raises a ``RuntimeError`` if ``hostnames="slurm"`` or ``workers_per_host="slurm"``

 Propagating exceptions
 ----------------------

-Exceptions that are raised in Workers will be raised by the launcher process.
+Exceptions that are raised in workers will be raised by the launcher process.

-A :mod:`torchrunx.AgentKilledError` will be raised if any agent dies unexpectedly (e.g. if force-killed by the OS, due to segmentation faults or OOM).
+A :mod:`torchrunx.AgentFailedError` or :mod:`torchrunx.WorkerFailedError` will be raised if any agent or worker dies unexpectedly (e.g. if sent a signal from the OS, due to segmentation faults or OOM).

 Environment variables
 ---------------------

@@ -100,14 +100,14 @@ Environment variables in the launcher process that match the ``default_env_vars`` argument are

 Custom logging
 --------------

-We forward all logs (i.e. from ``logging`` and ``stdio``) from workers and agents to the Launcher. By default, the logs from the first agent and its first worker are printed into the Launcher's ``stdout`` stream. Logs from all agents and workers are written to files in ``$TORCHRUNX_LOG_DIR`` (default: ``./torchrunx_logs``) and are named by timestamp, hostname, and local_rank.
+We forward all logs (i.e. from :mod:`logging` and :mod:`sys.stdin`/:mod:`sys.stdout`) from workers and agents to the launcher. By default, the logs from the first agent and its first worker are printed into the launcher's ``stdout`` stream. Logs from all agents and workers are written to files in ``$TORCHRUNX_LOG_DIR`` (default: ``./torchrunx_logs``) and are named by timestamp, hostname, and local_rank.

-``logging.Handler`` objects can be provided via the ``log_handlers`` argument to provide further customization (mapping specific agents/workers to custom output streams).
+:mod:`logging.Handler` objects can be provided via the ``log_handlers`` argument to provide further customization (mapping specific agents/workers to custom output streams).

 We provide some utilities to help:

-.. autofunction:: torchrunx.add_filter_to_handler
-
 .. autofunction:: torchrunx.file_handler

 .. autofunction:: torchrunx.stream_handler
+
+
+.. autofunction:: torchrunx.add_filter_to_handler
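
Per the "Propagating exceptions" section above, worker exceptions surface in the launcher process. A minimal sketch of catching the exceptions renamed in this commit, assuming both classes are importable from the top-level ``torchrunx`` namespace (as the docs/source/api.rst autoclass directives suggest):

```python
# Sketch: handling agent/worker failures at the launcher.
import torchrunx as trx


def train():
    ...  # hypothetical worker function


if __name__ == "__main__":
    try:
        trx.launch(func=train, hostnames=["localhost"], workers_per_host=2)
    except (trx.AgentFailedError, trx.WorkerFailedError) as e:
        # an agent or worker died unexpectedly (e.g. OS signal, segfault, OOM)
        print(f"torchrunx run failed: {e}")
        raise
```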

docs/source/api.rst

Lines changed: 3 additions & 1 deletion

@@ -6,4 +6,6 @@ API
 .. autoclass:: torchrunx.LaunchResult
    :members:

-.. autoclass:: torchrunx.AgentKilledError
+.. autoclass:: torchrunx.AgentFailedError
+
+.. autoclass:: torchrunx.WorkerFailedError

docs/source/conf.py

Lines changed: 1 addition & 1 deletion

@@ -17,8 +17,8 @@
     "myst_parser",
     "sphinx_toolbox.sidebar_links",
     "sphinx_toolbox.github",
-    "sphinx.ext.autodoc.typehints",
     "sphinx.ext.napoleon",
+    "sphinx.ext.autodoc.typehints",
     "sphinx.ext.linkcode",
 ]

src/torchrunx/launcher.py

Lines changed: 9 additions & 12 deletions

@@ -39,10 +39,7 @@

 @dataclass
 class Launcher:
-    """Alias class for ``torchrunx.launch``.
-
-    Useful for sequential invocations on the same configuration or for specifying arguments via CLI.
-    """
+    """Useful for sequential invocations or for specifying arguments via CLI."""

     hostnames: list[str] | Literal["auto", "slurm"] = "auto"
     workers_per_host: int | list[int] | Literal["auto", "slurm"] = "auto"

@@ -69,7 +66,7 @@ def run( # noqa: C901, PLR0912
         func_kwargs: dict[str, Any] | None = None,
         log_handlers: list[Handler] | Literal["auto"] | None = "auto",
     ) -> LaunchResult:
-        """Run a function using the configuration in ``torchrunx.Launcher``."""
+        """Run a function using the :mod:`torchrunx.Launcher` configuration."""
         if not dist.is_available():
             msg = "The torch.distributed package is not available."
             raise RuntimeError(msg)

@@ -267,21 +264,21 @@ class LaunchResult:
     hostnames: list[str]
     return_values: list[list[Any]]

-    def by_hostname(self) -> dict[str, list[Any]]:
+    def by_hostnames(self) -> dict[str, list[Any]]:
         """All return values from workers, indexed by host and local rank."""
         return dict(zip(self.hostnames, self.return_values))

-    def by_rank(self) -> list[Any]:
+    def by_ranks(self) -> list[Any]:
         """All return values from workers, indexed by global rank."""
         return reduce(add, self.return_values)

-    def get(self, hostname: str, rank: int) -> Any:
-        """Get return value from worker (indexed by host and local rank)."""
+    def index(self, hostname: str, rank: int) -> Any:
+        """Get return value from worker by host and local rank."""
         return self.return_values[self.hostnames.index(hostname)][rank]

-    def rank(self, idx: int) -> Any:
-        """Get return value from worker (indexed by global rank)."""
-        return self.by_rank()[idx]
+    def rank(self, i: int) -> Any:
+        """Get return value from worker by global rank."""
+        return self.by_ranks()[i]


 def _resolve_hostnames(hostnames: list[str] | Literal["auto", "slurm"]) -> list[str]:
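
For reference, the renamed ``LaunchResult`` accessors compose like this. A minimal sketch: constructing the object directly is illustrative only (``launch()`` normally builds it), and it assumes ``LaunchResult`` is a plain dataclass over the two fields shown in the diff.

```python
# Sketch of the renamed LaunchResult accessors (fields per the diff above).
from torchrunx import LaunchResult

result = LaunchResult(
    hostnames=["node1", "node2"],
    return_values=[["a0", "a1"], ["b0", "b1"]],  # one list per host, by local rank
)

assert result.by_hostnames() == {"node1": ["a0", "a1"], "node2": ["b0", "b1"]}
assert result.by_ranks() == ["a0", "a1", "b0", "b1"]  # global rank order
assert result.index("node2", 1) == "b1"  # by host + local rank
assert result.rank(2) == "b0"            # by global rank
```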

src/torchrunx/utils/logging.py

Lines changed: 7 additions & 7 deletions

@@ -41,10 +41,10 @@ def add_filter_to_handler(
     local_rank: int | None,  # None indicates agent
     log_level: int = logging.NOTSET,
 ) -> None:
-    """A filter for ``logging.Handler`` such that only specific agent/worker logs are handled.
+    """A filter for :mod:`logging.Handler` such that only specific agent/worker logs are handled.

     Args:
-        handler: ``logging.Handler`` to be modified.
+        handler: Handler to be modified.
         hostname: Name of specified host.
         local_rank: Rank of specified worker (or ``None`` for agent).
         log_level: Minimum log level to capture.

@@ -63,7 +63,7 @@ def _filter(record: WorkerLogRecord) -> bool:
 def stream_handler(
     hostname: str, local_rank: int | None, log_level: int = logging.NOTSET
 ) -> Handler:
-    """logging.Handler builder function for writing logs to stdout."""
+    """Handler builder function for writing logs from specified hostname/rank to stdout."""
     handler = logging.StreamHandler(stream=sys.stdout)
     add_filter_to_handler(handler, hostname, local_rank, log_level=log_level)
     handler.setFormatter(

@@ -82,7 +82,7 @@ def file_handler(
     file_path: str | os.PathLike,
     log_level: int = logging.NOTSET,
 ) -> Handler:
-    """logging.Handler builder function for writing logs to a file."""
+    """Handler builder function for writing logs from specified hostname/rank to a file."""
     handler = logging.FileHandler(file_path)
     add_filter_to_handler(handler, hostname, local_rank, log_level=log_level)
     formatter = logging.Formatter("%(asctime)s:%(levelname)s: %(message)s")

@@ -96,7 +96,7 @@ def file_handlers(
     log_dir: str | os.PathLike = Path("torchrunx_logs"),
     log_level: int = logging.NOTSET,
 ) -> list[Handler]:
-    """Builder function for writing logs for all workers/agents to a directory.
+    """Handler builder function for writing logs for all workers/agents to a directory.

     Files are named with timestamp, hostname, and the local_rank (for workers).
     """

@@ -123,9 +123,9 @@ def default_handlers(
     log_dir: str | os.PathLike = Path("torchrunx_logs"),
     log_level: int = logging.INFO,
 ) -> list[Handler]:
-    """A default set of logging.Handlers to be used when ``launch(log_handlers="auto")``.
+    """Default :mod:`logging.Handler`s for ``log_handlers="auto"`` in :mod:`torchrunx.launch`.

-    Logs for host[0] and its local_rank[0] worker are written to the launcher process stdout.
+    Logs for ``host[0]`` and its ``local_rank[0]`` worker are written to launcher process stdout.
     Logs for all agents/workers are written to files in ``log_dir`` (named by timestamp, hostname,
     local_rank).
     """
