Commit ba8b262

Migrate to Hermetic CUDA.

- Update bazel files.
- Use `clang` for PyTorch/XLA only.
- Fix StableHLO tests.
- Compile debugging information with dwarf-4.

1 parent: 066e69e

File tree

9 files changed: +66, -30 lines


.bazelrc

Lines changed: 19 additions & 9 deletions
@@ -53,14 +53,8 @@ build --copt=-fexceptions
 # safer than o+rx.
 build --spawn_strategy=sandboxed
 
-# Use GCC for C/C++ compilation.
-build --action_env=CC=gcc
-build --action_env=CXX=g++
-
-###########################################################################
-
-build:clang --action_env=CC=/usr/bin/clang-17
-build:clang --action_env=CXX=/usr/bin/clang++-17
+build --action_env=CC=clang
+build --action_env=CXX=clang++
 
 ###########################################################################
 
@@ -85,8 +79,22 @@ build:cuda --repo_env TF_NEED_CUDA=1
 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
 build:cuda --@local_config_cuda//:enable_cuda
 build:cuda --define=xla_python_enable_gpu=true
+# Define XLA_CUDA for C++ files.
 build:cuda --cxxopt=-DXLA_CUDA=1
 
+# Default hermetic CUDA and CUDNN versions.
+build:cuda --repo_env HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_80,compute_90"
+build:cuda --repo_env HERMETIC_CUDA_VERSION="12.3.2"
+build:cuda --repo_env HERMETIC_CUDNN_VERSION="9.1.1"
+
+# Link NCCL statically, as it was with the non-hermetic build.
+build:cuda --repo_env TF_NCCL_USE_STUB=0
+
+# Use NVCC for compiling CUDA.
+build:cuda --action_env TF_NVCC_CLANG=1
+build:cuda --@local_config_cuda//:cuda_compiler=nvcc
+build:cuda --@local_config_cuda//cuda:include_cuda_libs=false
+
 # Coverage with cuda/gcc/nvcc requires manually setting coverage flags.
 coverage:cuda --per_file_copt=third_party/.*,torch_xla/.*@--coverage
 coverage:cuda --linkopt=-lgcov
@@ -254,8 +262,10 @@ build:linux --copt="-Werror=unused-result"
 build:linux --copt="-Wswitch"
 build:linux --copt="-Werror=switch"
 # Required for building with clang
-build:linux --copt="-Wno-error=unused-but-set-variable"
+build:linux --copt="-Qunused-arguments"
+build:linux --copt="-Wno-unused-command-line-argument"
 
 # Only include debug info for files not under XLA.
 build:dbg -c dbg
+build:dbg --copt=-gdwarf-4
 build:dbg --per_file_copt=external/xla/.*@-g0,-DNDEBUG

WORKSPACE

Lines changed: 36 additions & 5 deletions
@@ -56,7 +56,6 @@ http_archive(
     ],
     patch_tool = "patch",
     patches = [
-        "//openxla_patches:gpu_nvml.diff",
         "//openxla_patches:gpu_race_condition.diff",
         "//openxla_patches:count_down.diff",
     ],
@@ -134,17 +133,49 @@ load("@xla//:workspace0.bzl", "xla_workspace0")
 
 xla_workspace0()
 
+load(
+    "@xla//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl",
+    "cuda_json_init_repository",
+)
+
+cuda_json_init_repository()
+
+load(
+    "@cuda_redist_json//:distributions.bzl",
+    "CUDA_REDISTRIBUTIONS",
+    "CUDNN_REDISTRIBUTIONS",
+)
+load(
+    "@xla//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl",
+    "cuda_redist_init_repositories",
+    "cudnn_redist_init_repository",
+)
+
+cuda_redist_init_repositories(
+    cuda_redistributions = CUDA_REDISTRIBUTIONS,
+)
+
+cudnn_redist_init_repository(
+    cudnn_redistributions = CUDNN_REDISTRIBUTIONS,
+)
 
 load(
-    "@xla//third_party/gpus:cuda_configure.bzl",
-    "cuda_configure",
+    "@xla//third_party/gpus/cuda/hermetic:cuda_configure.bzl",
+    "cuda_configure",
 )
 
 cuda_configure(name = "local_config_cuda")
 
 load(
-    "@xla//third_party/nccl:nccl_configure.bzl",
-    "nccl_configure",
+    "@xla//third_party/nccl/hermetic:nccl_redist_init_repository.bzl",
+    "nccl_redist_init_repository",
+)
+
+nccl_redist_init_repository()
+
+load(
+    "@xla//third_party/nccl/hermetic:nccl_configure.bzl",
+    "nccl_configure",
 )
 
 nccl_configure(name = "local_config_nccl")

infra/ansible/config/env.yaml

Lines changed: 4 additions & 3 deletions
@@ -2,11 +2,8 @@
 # They'll be accessible for all processes on the host, also in the development image.
 release_env:
   common:
-    # Force GCC because clang/bazel has issues.
     CC: gcc-10
     CXX: g++-10
-    # CC: "clang-{{ clang_version }}"
-    # CXX: "clang++-{{ clang_version }}"
     LD_LIBRARY_PATH: "$LD_LIBRARY_PATH:/usr/local/lib"
 
   tpu:
@@ -49,3 +46,7 @@ build_env:
     ACCELERATOR: tpu
     TPUVM_MODE: 1
    BUNDLE_LIBTPU: "{{ bundle_libtpu }}"
+
+clang_compiler:
+  CC: /usr/lib/{{ clang_version }}/bin/clang
+  CXX: /usr/lib/{{ clang_version }}/bin/clang++

infra/ansible/playbook.yaml

Lines changed: 3 additions & 1 deletion
@@ -85,6 +85,7 @@
         combine(build_env[arch] | default({}, true)) |
         combine(build_env[accelerator] | default({}, true))
       }}"
+    clang_compiler: "{{ clang_compiler }}"
   when: stage == "build"
   tags: build_srcs
 
@@ -94,7 +95,8 @@
     env_vars: "{{
       build_env.common | default({}, true) |
      combine(build_env[arch] | default({}, true)) |
-      combine(build_env[accelerator] | default({}, true))
+      combine(build_env[accelerator] | default({}, true)) |
+      combine(clang_compiler)
     }}"
   when: stage == "build_plugin"
   tags: build_plugin
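
Note on the `combine` chain above: Ansible's `combine` filter merges dictionaries left to right, with keys from later dictionaries overriding earlier ones, so the CC/CXX values from `clang_compiler` win over anything already in the merged build_env. A minimal Python sketch of the same merge (illustrative only; the paths are hypothetical, assuming `clang_version` resolves to something like `llvm-17`):

build_env = {"CC": "gcc-10", "CXX": "g++-10", "ACCELERATOR": "tpu"}
clang_compiler = {
    "CC": "/usr/lib/llvm-17/bin/clang",     # hypothetical resolved path
    "CXX": "/usr/lib/llvm-17/bin/clang++",  # hypothetical resolved path
}

# Later dicts override earlier ones, mirroring `env_vars | combine(clang_compiler)`.
env_vars = {**build_env, **clang_compiler}
assert env_vars["CC"] == "/usr/lib/llvm-17/bin/clang"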

infra/ansible/roles/build_srcs/tasks/main.yaml

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@
   ansible.builtin.command:
     cmd: python setup.py bdist_wheel
     chdir: "{{ (src_root, 'pytorch/xla') | path_join }}"
-  environment: "{{ env_vars }}"
+  environment: "{{ env_vars | combine(clang_compiler) }}"
 
 - name: Find XLA *.whl files in pytorch/xla/dist
   ansible.builtin.find:

test/test_operations.py

Lines changed: 2 additions & 2 deletions
@@ -2209,8 +2209,8 @@ def test_inplace_mul_scalar_different_dtype(self):
    def fn(inp, s):
      return inp.mul_(s)
 
-    inp = torch.rand(10, dtype=torch.half)
-    s = torch.tensor(7, dtype=torch.double)
+    inp = torch.arange(10).to(torch.half)
+    s = torch.tensor(3, dtype=torch.double)
 
    Xinp = inp.to(xm.xla_device())
    Xs = s.to(xm.xla_device())
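
The new inputs are deterministic: `torch.arange(10)` produces small integers that are exactly representable in float16, unlike `torch.rand` values, so CPU and XLA results can be compared without precision-dependent flakiness. A short sketch of the dtype behavior the test exercises (my reading of it, not part of the commit):

import torch

inp = torch.arange(10).to(torch.half)    # exact fp16 values
s = torch.tensor(3, dtype=torch.double)  # 0-dim double tensor

# A 0-dim tensor does not promote a same-category (floating-point) operand,
# so the in-place mul_ keeps inp's half dtype.
out = inp.mul_(s)
assert out.dtype == torch.half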
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 import torch
 
-torch.library.register_kernel("aten::upsample_trilinear3d", "xla",
+torch.library.register_kernel("aten::upsample_trilinear3d", "XLA",
                               torch._decomp.decompositions.upsample_trilinear3d)
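
The dispatch key fix: the second argument is matched against the backend's dispatch key, and `XLA` (uppercase) is the key PyTorch registers for XLA tensors, so the lowercase `"xla"` presumably never matched. A hedged way to confirm the registration, using a private helper that exists in recent PyTorch builds but is not a stable API:

import torch

# Returns True once a kernel is registered under the XLA dispatch key.
print(torch._C._dispatch_has_kernel_for_dispatch_key(
    "aten::upsample_trilinear3d", "XLA"))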

torch_xla/csrc/runtime/stablehlo_composite_helper.cpp

Lines changed: 0 additions & 4 deletions
@@ -16,8 +16,6 @@
 namespace torch_xla {
 namespace runtime {
 
-namespace {
-
 using nlohmann::json;
 
 static bool IsXlaMarkTensorOp(mlir::Operation* op) {
@@ -529,8 +527,6 @@ class RemoveXlaMarkTensorOpsPass
   }
 };
 
-}  // namespace
-
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
 CreateBuildStableHLOCompositePass() {
   return std::make_unique<BuildStableHLOCompositePass>();

torch_xla/csrc/runtime/xla_mlir_debuginfo_helper.cpp

Lines changed: 0 additions & 4 deletions
@@ -9,8 +9,6 @@
 namespace torch_xla {
 namespace runtime {
 
-namespace {
-
 // Defined in torch_xla/experimental/xla_mlir_debuginfo.py
 static constexpr char XLA_MLIR_DEBUGINFO_BEGIN[] = "<XLA_MLIR_DEBUGINFO_BEGIN>";
 static constexpr char XLA_MLIR_DEBUGINFO_END[] = "<XLA_MLIR_DEBUGINFO_END>";
@@ -81,8 +79,6 @@ class PrepareXlaMlirDebuginfoPass : public mlir::OperationPass<mlir::ModuleOp> {
   }
 };
 
-}  // namespace
-
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
 CreatePrepareXlaMlirDebuginfoPass() {
   return std::make_unique<PrepareXlaMlirDebuginfoPass>();
