[ROCm] add ROCm support (pt. 2) (#7039)

jeffdaily · jameslamb · shiyu1994 · web-flow · commit e5bd90d6aadc · 2026-03-06T01:33:24.000+08:00
* [ROCm] re-add support for ROCm builds Previously #6086 added ROCm support but after numerous rebases it lost critical changes. This PR restores the ROCm build. There are many source file changes but most were automated using the following: ```bash for f in `grep -rl '#ifdef USE_CUDA'` do sed -i 's@#ifdef USE_CUDA@#if defined(USE_CUDA) || defined(USE_ROCM)@g' $f done for f in `grep -rl '#endif // USE_CUDA'` do sed -i 's@#endif // USE_CUDA@#endif // USE_CUDA || USE_ROCM@g' $f done ``` * Fix error in cpp_tests/test_arrow.cpp. error: explicit specialization in non-namespace scope ‘class ArrowChunkedArrayTest’ * update for ROCm 7 BC-breaking change to warpSize * lint * Revert "Fix error in cpp_tests/test_arrow.cpp." This reverts commit e461e86. * partial revert of 61ec4f1 Instead of replacing all #ifdef USE_CUDA, just add USE_CUDA define to ROCm build. * add --use-rocm option to build-python.sh * fix cuda build missing CUDASUCCESS_OR_FATAL in vector_cudahost.h * add rocm docs * fix doc using pre-commit * apply reviewer suggestions * fix build-python.sh doc * fix build for rocm 7.0 --------- Co-authored-by: James Lamb <jaylamb20@gmail.com> Co-authored-by: shiyu1994 <shiyu_k1994@qq.com> Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -286,6 +286,9 @@ if(USE_ROCM)
     endif()
     message(STATUS "CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}")
 
+    # Building for ROCm almost always means USE_CUDA.
+    # Exceptions to this will be guarded by USE_ROCM.
+    add_definitions(-DUSE_CUDA)
     add_definitions(-DUSE_ROCM)
 endif()
 
@@ -473,10 +476,21 @@ set(
       src/cuda/cuda_algorithms.cu
 )
 
-if(USE_CUDA)
+if(USE_CUDA OR USE_ROCM)
   list(APPEND LGBM_SOURCES ${LGBM_CUDA_SOURCES})
 endif()
 
+if(USE_ROCM)
+  set(CU_FILES "")
+  foreach(file IN LISTS LGBM_CUDA_SOURCES)
+      string(REGEX MATCH "\\.cu$" is_cu_file "${file}")
+      if(is_cu_file)
+          list(APPEND CU_FILES "${file}")
+      endif()
+  endforeach()
+  set_source_files_properties(${CU_FILES} PROPERTIES LANGUAGE HIP)
+endif()
+
 add_library(lightgbm_objs OBJECT ${LGBM_SOURCES})
 
 if(BUILD_CLI)
@@ -629,6 +643,10 @@ if(USE_CUDA)
   endif()
 endif()
 
+if(USE_ROCM)
+  target_link_libraries(lightgbm_objs PUBLIC hip::host)
+endif()
+
 if(WIN32)
     if(MINGW OR CYGWIN)
       target_link_libraries(lightgbm_objs PUBLIC ws2_32 iphlpapi)
diff --git a/build-python.sh b/build-python.sh
@@ -54,6 +54,8 @@
 #     --precompile
 #                                   Use precompiled library.
 #                                   Only used with 'install' command.
+#     --rocm
+#                                   Compile ROCm version.
 #     --time-costs
 #                                   Compile version that outputs time costs for different internal routines.
 #     --user
@@ -142,6 +144,9 @@ while [ $# -gt 0 ]; do
     --cuda)
         BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_CUDA=ON"
         ;;
+    --rocm)
+        BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_ROCM=ON"
+        ;;
     --gpu)
         BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_GPU=ON"
         ;;
diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst
@@ -749,6 +749,65 @@ macOS
 
 The CUDA version is not supported on macOS.
 
+Build ROCm Version
+~~~~~~~~~~~~~~~~~~
+
+The `original GPU version <#build-gpu-version>`__ of LightGBM (``device_type=gpu``) is based on OpenCL.
+
+The ROCm-based version (``device_type=cuda``) is a separate implementation. Yes, the ROCm version reuses the ``device_type=cuda`` as a convenience for users.  Use this version in Linux environments with an AMD GPU.
+
+Windows
+^^^^^^^
+
+The ROCm version is not supported on Windows.
+Use the `GPU version <#build-gpu-version>`__ (``device_type=gpu``) for GPU acceleration on Windows.
+
+Linux
+^^^^^
+
+On Linux, a ROCm version of LightGBM can be built using
+
+- **CMake**, **gcc** and **ROCm**;
+- **CMake**, **Clang** and **ROCm**.
+
+Please refer to `the ROCm docs`_ for **ROCm** libraries installation.
+
+After compilation the executable and ``.so`` files will be in ``LightGBM/`` folder.
+
+gcc
+***
+
+1. Install `CMake`_, **gcc** and **ROCm**.
+
+2. Run the following commands:
+
+   .. code:: sh
+
+     git clone --recursive https://github.com/microsoft/LightGBM
+     cd LightGBM
+     cmake -B build -S . -DUSE_ROCM=ON
+     cmake --build build -j4
+
+Clang
+*****
+
+1. Install `CMake`_, **Clang**, **OpenMP** and **ROCm**.
+
+2. Run the following commands:
+
+   .. code:: sh
+
+     git clone --recursive https://github.com/microsoft/LightGBM
+     cd LightGBM
+     export CXX=clang++-14 CC=clang-14  # replace "14" with version of Clang installed on your machine
+     cmake -B build -S . -DUSE_ROCM=ON
+     cmake --build build -j4
+
+macOS
+^^^^^
+
+The ROCm version is not supported on macOS.
+
 Build Java Wrapper
 ~~~~~~~~~~~~~~~~~~
 
@@ -1054,6 +1113,8 @@ gcc
 
 .. _this detailed guide: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
 
+.. _the ROCm docs: https://rocm.docs.amd.com/projects/install-on-linux/en/latest/
+
 .. _following docs: https://github.com/google/sanitizers/wiki
 
 .. _Ninja: https://ninja-build.org
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
@@ -264,15 +264,15 @@ Core Parameters
 
    -  ``cpu`` supports all LightGBM functionality and is portable across the widest range of operating systems and hardware
 
-   -  ``cuda`` offers faster training than ``gpu`` or ``cpu``, but only works on GPUs supporting CUDA
+   -  ``cuda`` offers faster training than ``gpu`` or ``cpu``, but only works on GPUs supporting CUDA or ROCm
 
    -  ``gpu`` can be faster than ``cpu`` and works on a wider range of GPUs than CUDA
 
    -  **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up
 
    -  **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training
 
-   -  **Note**: refer to `Installation Guide <./Installation-Guide.rst>`__ to build LightGBM with GPU or CUDA support
+   -  **Note**: refer to `Installation Guide <./Installation-Guide.rst>`__ to build LightGBM with GPU, CUDA, or ROCm support
 
 -  ``seed`` :raw-html:`<a id="seed" title="Permalink to this parameter" href="#seed">&#x1F517;&#xFE0E;</a>`, default = ``None``, type = int, aliases: ``random_seed``, ``random_state``
 
diff --git a/docs/_static/js/script.js b/docs/_static/js/script.js
@@ -22,6 +22,7 @@ $(() => {
             "#build-mpi-version",
             "#build-gpu-version",
             "#build-cuda-version",
+            "#build-rocm-version",
             "#build-java-wrapper",
             "#build-python-package",
             "#build-r-package",
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
@@ -246,11 +246,11 @@ struct Config {
   // alias = device
   // desc = device for the tree learning
   // desc = ``cpu`` supports all LightGBM functionality and is portable across the widest range of operating systems and hardware
-  // desc = ``cuda`` offers faster training than ``gpu`` or ``cpu``, but only works on GPUs supporting CUDA
+  // desc = ``cuda`` offers faster training than ``gpu`` or ``cpu``, but only works on GPUs supporting CUDA or ROCm
   // desc = ``gpu`` can be faster than ``cpu`` and works on a wider range of GPUs than CUDA
   // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up
   // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training
-  // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst>`__ to build LightGBM with GPU or CUDA support
+  // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst>`__ to build LightGBM with GPU, CUDA, or ROCm support
   std::string device_type = "cpu";
 
   // [no-automatically-extract]
diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp
@@ -9,8 +9,10 @@
 
 #ifdef USE_CUDA
 
+#ifndef USE_ROCM
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
 #include <stdio.h>
 
 #include <LightGBM/bin.h>
diff --git a/include/LightGBM/cuda/cuda_random.hpp b/include/LightGBM/cuda/cuda_random.hpp
@@ -7,8 +7,10 @@
 
 #ifdef USE_CUDA
 
+#ifndef USE_ROCM
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
 
 namespace LightGBM {
 
diff --git a/include/LightGBM/cuda/cuda_rocm_interop.h b/include/LightGBM/cuda/cuda_rocm_interop.h
@@ -7,16 +7,59 @@
 
 #ifdef USE_CUDA
 
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)
-// ROCm doesn't have __shfl_down_sync, only __shfl_down without mask.
+#if defined(__HIP_PLATFORM_AMD__)
+
+// ROCm doesn't have atomicAdd_block, but it should be semantically the same as atomicAdd
+#define atomicAdd_block atomicAdd
+
+// hipify
+#include <hip/hip_runtime.h>
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaFree hipFree
+#define cudaFreeHost hipFreeHost
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorName hipGetErrorName
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaHostAlloc hipHostAlloc
+#define cudaHostAllocPortable hipHostAllocPortable
+#define cudaMalloc hipMalloc
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemoryTypeHost hipMemoryTypeHost
+#define cudaMemset hipMemset
+#define cudaPointerAttributes hipPointerAttribute_t
+#define cudaPointerGetAttributes hipPointerGetAttributes
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+
+// ROCm 7.0 did add __shfl_down_sync et al, but the following hack still works.
 // Since mask is full 0xffffffff, we can use __shfl_down instead.
 #define __shfl_down_sync(mask, val, offset) __shfl_down(val, offset)
 #define __shfl_up_sync(mask, val, offset) __shfl_up(val, offset)
-// ROCm warpSize is constexpr and is either 32 or 64 depending on gfx arch.
-#define WARPSIZE warpSize
-// ROCm doesn't have atomicAdd_block, but it should be semantically the same as atomicAdd
-#define atomicAdd_block atomicAdd
-#else
+
+// warpSize is only allowed for device code.
+// HIP header used to define warpSize as a constexpr that was either 32 or 64
+// depending on the target device, and then always set it to 64 for host code.
+static inline constexpr int WARP_SIZE_INTERNAL() {
+#if defined(__GFX9__)
+  return 64;
+#else  // __GFX9__
+  return 32;
+#endif  // __GFX9__
+}
+#define WARPSIZE (WARP_SIZE_INTERNAL())
+
+#else  // __HIP_PLATFORM_AMD__
 // CUDA warpSize is not a constexpr, but always 32
 #define WARPSIZE 32
 #endif  // defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)
diff --git a/include/LightGBM/cuda/cuda_utils.hu b/include/LightGBM/cuda/cuda_utils.hu
@@ -8,8 +8,12 @@
 
 #ifdef USE_CUDA
 
+#if defined(USE_ROCM)
+#include <LightGBM/cuda/cuda_rocm_interop.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
 #include <stdio.h>
 #include <nccl.h>
 
diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h
@@ -9,9 +9,12 @@
 #include <LightGBM/utils/common.h>
 
 #ifdef USE_CUDA
+#ifndef USE_ROCM
 #include <cuda.h>
 #include <cuda_runtime.h>
-#endif
+#endif  // USE_ROCM
+#include <LightGBM/cuda/cuda_utils.hu>
+#endif  // USE_CUDA
 #include <stdio.h>
 
 enum LGBM_Device {
@@ -66,14 +69,14 @@ struct CHAllocator {
     #ifdef USE_CUDA
       if (LGBM_config_::current_device == lgbm_device_cuda) {
         cudaPointerAttributes attributes;
-        cudaPointerGetAttributes(&attributes, p);
-        #if CUDA_VERSION >= 10000
+        CUDASUCCESS_OR_FATAL(cudaPointerGetAttributes(&attributes, p));
+        #if CUDA_VERSION >= 10000 || defined(USE_ROCM)
           if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) {
-            cudaFreeHost(p);
+            CUDASUCCESS_OR_FATAL(cudaFreeHost(p));
           }
         #else
           if ((attributes.memoryType == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) {
-            cudaFreeHost(p);
+            CUDASUCCESS_OR_FATAL(cudaFreeHost(p));
           }
         #endif
       } else {
diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp
@@ -5,6 +5,7 @@
 
 #ifdef USE_CUDA
 
+#include <LightGBM/cuda/cuda_rocm_interop.h>
 #include <LightGBM/cuda/cuda_utils.hu>
 
 namespace LightGBM {
diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu
@@ -940,7 +940,11 @@ __global__ void FindBestSplitsDiscretizedForLeafKernel(
   if (is_feature_used_bytree[inner_feature_index]) {
     if (task->is_categorical) {
       __threadfence();  // ensure store issued before trap
+#if defined(USE_ROCM)
+      __builtin_trap();
+#else
       asm("trap;");
+#endif
     } else {
       if (!task->reverse) {
         if (use_16bit_bin) {
diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp
@@ -172,7 +172,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {
     #pragma warning(disable : 4702)
     explicit CUDASingleGPUTreeLearner(const Config* tree_config, const bool /*boosting_on_cuda*/) : SerialTreeLearner(tree_config) {
       Log::Fatal("CUDA Tree Learner was not enabled in this build.\n"
-                 "Please recompile with CMake option -DUSE_CUDA=1");
+                 "Please recompile with CMake option -DUSE_CUDA=1 (NVIDIA GPUs) or -DUSE_ROCM=1 (AMD GPUs)");
     }
 };