Update PjRt registry to use Status QOL functions

ysiraichi · ysiraichi · commit 9ebf41c0f5fa · 2025-07-01T13:28:31.000-03:00
Replace `XLA_CHECK` macros with `XLA_ASSIGN_OR_RETURN` and
`XLA_RETURN_IF_ERROR` for better error handling:
- `InitializePjRt()` now returns `StatusOr&lt;T&gt;` instead of throwing on
errors
- Enhanced error messages with location context
- Consistent error handling across all device types
diff --git a/torch_xla/csrc/runtime/BUILD b/torch_xla/csrc/runtime/BUILD
@@ -212,6 +212,7 @@ cc_library(
         "//torch_xla/csrc:status",
         "@torch//:headers",
         "@com_google_absl//absl/log:initialize",
+        "@com_google_absl//absl/log:absl_check",
         "@xla//xla/pjrt:pjrt_c_api_client",
         "@xla//xla/pjrt:tfrt_cpu_pjrt_client",
         "@xla//xla/pjrt/gpu:se_gpu_pjrt_client",
diff --git a/torch_xla/csrc/runtime/ifrt_computation_client.cpp b/torch_xla/csrc/runtime/ifrt_computation_client.cpp
@@ -123,7 +123,8 @@ std::vector<std::string> IfrtComputationClient::IfrtDevicesToString(
 IfrtComputationClient::IfrtComputationClient() {
   std::string device_type = sys_util::GetEnvString(env::kEnvPjRtDevice, "");
   std::unique_ptr<xla::PjRtClient> pjrt_client;
-  std::tie(pjrt_client, coordinator_) = InitializePjRt(device_type);
+  std::tie(pjrt_client, coordinator_) =
+      GetValueOrThrow(InitializePjRt(device_type));
 
   client_ = xla::ifrt::PjRtClient::Create(std::move(pjrt_client));
 
diff --git a/torch_xla/csrc/runtime/pjrt_computation_client.cpp b/torch_xla/csrc/runtime/pjrt_computation_client.cpp
@@ -117,7 +117,8 @@ std::vector<std::string> PjRtComputationClient::PjRtDevicesToString(
 
 PjRtComputationClient::PjRtComputationClient() {
   std::string device_type = sys_util::GetEnvString(env::kEnvPjRtDevice, "");
-  std::tie(client_, coordinator_) = InitializePjRt(device_type);
+  std::tie(client_, coordinator_) =
+      GetValueOrThrow(InitializePjRt(device_type));
 
   // PjRtDevice IDs are not guaranteed to be dense, so we need to track
   // a device's global ordinal separately from its device ID. Order the
diff --git a/torch_xla/csrc/runtime/pjrt_registry.cpp b/torch_xla/csrc/runtime/pjrt_registry.cpp
@@ -2,6 +2,7 @@
 
 #include <c10/util/Exception.h>
 
+#include "absl/log/absl_check.h"
 #include "absl/log/initialize.h"
 #include "torch_xla/csrc/runtime/debug_macros.h"
 #include "torch_xla/csrc/runtime/env_vars.h"
@@ -74,7 +75,8 @@ void RegisterPjRtPlugin(std::string name,
   pjrt_plugins_[name] = plugin;
 }
 
-std::tuple<std::unique_ptr<xla::PjRtClient>, std::unique_ptr<XlaCoordinator>>
+absl::StatusOr<std::tuple<absl_nonnull std::unique_ptr<xla::PjRtClient>,
+                          std::unique_ptr<XlaCoordinator>>>
 InitializePjRt(const std::string& device_type) {
   std::unique_ptr<xla::PjRtClient> client;
   std::unique_ptr<XlaCoordinator> coordinator;
@@ -110,28 +112,34 @@ InitializePjRt(const std::string& device_type) {
                    << ", coordinator address=" << master_addr << ":" << port;
 
         // Use the XlaCoordinator as the distributed key-value store.
-        coordinator = GetValueOrThrow(XlaCoordinator::Create(
-            global_process_rank, global_world_size, master_addr, port));
+        XLA_ASSIGN_OR_RETURN(
+            coordinator,
+            XlaCoordinator::Create(global_process_rank, global_world_size,
+                                   master_addr, port));
         std::shared_ptr<xla::DistributedRuntimeClient> distributed_client =
             coordinator->GetClient();
         kv_store = xla::GetDistributedKeyValueStore(distributed_client,
                                                     /*key_prefix=*/"pjrt:");
       }
-      const PJRT_Api* c_api = *pjrt::LoadPjrtPlugin(
-          absl::AsciiStrToLower(device_type), plugin->library_path());
-      XLA_CHECK_OK(pjrt::InitializePjrtPlugin(device_type));
+      XLA_ASSIGN_OR_RETURN(
+          const PJRT_Api* c_api,
+          pjrt::LoadPjrtPlugin(absl::AsciiStrToLower(device_type),
+                               plugin->library_path()));
+      XLA_RETURN_IF_ERROR(pjrt::InitializePjrtPlugin(device_type));
       auto create_options = plugin->client_create_options();
-      client = xla::GetCApiClient(
-                   absl::AsciiStrToUpper(device_type),
-                   {create_options.begin(), create_options.end()}, kv_store)
-                   .value();
+      XLA_ASSIGN_OR_RETURN(
+          client,
+          xla::GetCApiClient(absl::AsciiStrToUpper(device_type),
+                             {create_options.begin(), create_options.end()},
+                             kv_store));
       profiler::RegisterProfilerForPlugin(c_api);
     }
   } else if (device_type == "CPU") {
     TF_VLOG(1) << "Initializing PjRt CPU client...";
     bool async = sys_util::GetEnvBool(env::kEnvPjrtAsyncCpuClient, true);
     int cpu_device_count = sys_util::GetEnvInt(env::kEnvNumCpu, 1);
-    client = std::move(xla::GetPjRtCpuClient(async, cpu_device_count).value());
+    XLA_ASSIGN_OR_RETURN(client,
+                         xla::GetPjRtCpuClient(async, cpu_device_count));
   } else if (device_type == "TPU") {
     TF_VLOG(1) << "Initializing TFRT TPU client...";
     // Init the absl logging to avoid the log spam.
@@ -140,15 +148,14 @@ InitializePjRt(const std::string& device_type) {
     auto tpu_library_path = sys_util::GetEnvString(
         env::kEnvTpuLibraryPath,
         sys_util::GetEnvString(env::kEnvInferredTpuLibraryPath, "libtpu.so"));
-    XLA_CHECK_OK(pjrt::LoadPjrtPlugin("tpu", tpu_library_path).status());
-    absl::Status tpu_status = pjrt::InitializePjrtPlugin("tpu");
-    XLA_CHECK_OK(tpu_status);
-    client = std::move(xla::GetCApiClient("TPU").value());
-    const PJRT_Api* c_api =
-        static_cast<xla::PjRtCApiClient*>(client.get())->pjrt_c_api();
+    XLA_ASSIGN_OR_RETURN(const PJRT_Api* c_api,
+                         pjrt::LoadPjrtPlugin("tpu", tpu_library_path));
+    XLA_RETURN_IF_ERROR(pjrt::InitializePjrtPlugin("tpu"));
+    XLA_ASSIGN_OR_RETURN(client, xla::GetCApiClient("TPU"));
     profiler::RegisterProfilerForPlugin(c_api);
   } else if (device_type == "TPU_LEGACY") {
-    XLA_ERROR() << "TPU_LEGACY client is no longer available.";
+    return XLA_ERROR_WITH_LOCATION(absl::InvalidArgumentError(
+        "TPU_LEGACY client is no longer available."));
   } else if (device_type == "CUDA") {
     TORCH_WARN("The XLA:CUDA device is deprecated in release 2.8. ",
                "Future releases might remove XLA:CUDA support entirely. ",
@@ -183,8 +190,10 @@ InitializePjRt(const std::string& device_type) {
           runtime::sys_util::GetEnvString("MASTER_ADDR", "localhost");
       std::string port = runtime::sys_util::GetEnvString(
           "XLA_COORDINATOR_PORT", XlaCoordinator::kDefaultCoordinatorPort);
-      coordinator = GetValueOrThrow(XlaCoordinator::Create(
-          global_process_rank, global_world_size, master_addr, port));
+      XLA_ASSIGN_OR_RETURN(
+          coordinator,
+          XlaCoordinator::Create(global_process_rank, global_world_size,
+                                 master_addr, port));
       std::shared_ptr<xla::DistributedRuntimeClient> distributed_client =
           coordinator->GetClient();
       kv_store = xla::GetDistributedKeyValueStore(distributed_client,
@@ -199,27 +208,25 @@ InitializePjRt(const std::string& device_type) {
     options.platform_name = "gpu";
     options.should_stage_host_to_device_transfers = true;
     options.kv_store = kv_store;
-    client = std::move(xla::GetStreamExecutorGpuClient(options).value());
+    XLA_ASSIGN_OR_RETURN(client, xla::GetStreamExecutorGpuClient(options));
   } else if (device_type == "XPU") {
     TF_VLOG(1) << "Initializing PjRt XPU client...";
-    XLA_CHECK_OK(
-        pjrt::LoadPjrtPlugin(
-            "xpu", sys_util::GetEnvString(env::kEnvXpuLibraryPath, "libxpu.so"))
-            .status());
-    client = std::move(xla::GetCApiClient("XPU").value());
+    XLA_RETURN_IF_ERROR(pjrt::LoadPjrtPlugin(
+        "xpu", sys_util::GetEnvString(env::kEnvXpuLibraryPath, "libxpu.so")));
+    XLA_ASSIGN_OR_RETURN(client, xla::GetCApiClient("XPU"));
   } else if (device_type == "NEURON") {
     TF_VLOG(1) << "Initializing PjRt NEURON client...";
-    XLA_CHECK_OK(pjrt::LoadPjrtPlugin("NEURON", sys_util::GetEnvString(
-                                                    env::kEnvNeuronLibraryPath,
-                                                    "libneuronpjrt.so"))
-                     .status());
-    client = std::move(xla::GetCApiClient("NEURON").value());
+    XLA_RETURN_IF_ERROR(pjrt::LoadPjrtPlugin(
+        "NEURON", sys_util::GetEnvString(env::kEnvNeuronLibraryPath,
+                                         "libneuronpjrt.so")));
+    XLA_ASSIGN_OR_RETURN(client, xla::GetCApiClient("NEURON"));
+  } else {
+    return XLA_ERROR_WITH_LOCATION(absl::InvalidArgumentError(absl::StrCat(
+        "Unknown ", env::kEnvPjRtDevice, ": '", device_type, "'")));
   }
 
-  XLA_CHECK(client) << absl::StrFormat("Unknown %s '%s'", env::kEnvPjRtDevice,
-                                       device_type);
-
-  return {std::move(client), std::move(coordinator)};
+  ABSL_CHECK(client);
+  return std::make_tuple(std::move(client), std::move(coordinator));
 }
 
 }  // namespace runtime
diff --git a/torch_xla/csrc/runtime/pjrt_registry.h b/torch_xla/csrc/runtime/pjrt_registry.h
@@ -21,7 +21,8 @@ class PjRtPlugin {
 void RegisterPjRtPlugin(std::string name,
                         std::shared_ptr<const PjRtPlugin> plugin);
 
-std::tuple<std::unique_ptr<xla::PjRtClient>, std::unique_ptr<XlaCoordinator>>
+absl::StatusOr<std::tuple<absl_nonnull std::unique_ptr<xla::PjRtClient>,
+                          std::unique_ptr<XlaCoordinator>>>
 InitializePjRt(const std::string& device_type);
 
 }  // namespace runtime