Skip to content

Commit 569f89a

Browse files
[CUDA] Multi-GPU for CUDA Version (#6138)
* initialize nccl * change year in header * add implementation of nccl gbdt * add nccl topology * clean up * clean up * set nccl info * support quantized training with categorical features on cpu * remove white spaces * add tests for quantized training with categorical features * skip tests for cuda version * fix cases when only 1 data block in row-wise quantized histogram construction with 8 inner bits * remove useless capture * fix inconsistency of gpu devices * fix creating boosting object from file * change num_gpu to num_gpus in test case * fix objective initialization fix lint errors * fix c++ compilation warning * fix lint errors * fix compilation warnings * change num_gpu to num_gpus in R test case * add nccl synchronization in tree training * fix global num data update * fix ruff-format issues * use global num data in split finder * explicit initialization of NCCLInfo members * fix compilation * use CUDAVector * use CUDAVector * merge master * use CUDAVector * use CUDAVector for cuda tree and column data * update gbdt * changes for cuda tree * use CUDAVector for cuda column data * disable cuda by default * fix single machine gbdt * clean up * fix typo * fix lint issues * use num_gpu instead of num_gpus * fix compilation error * fix cpp lint errors * fix reset config for cuda data partition * fix subrow copy in cuda column data * fix cmakelint errors * Update include/LightGBM/config.h Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update include/LightGBM/config.h Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update include/LightGBM/cuda/cuda_nccl_topology.hpp Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update include/LightGBM/config.h Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update src/treelearner/cuda/cuda_data_partition.cu Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update src/treelearner/cuda/cuda_data_partition.cu Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update src/treelearner/cuda/cuda_leaf_splits.cu 
Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * remove WARPSIZE before #6086 is merged * Update src/treelearner/cuda/cuda_leaf_splits.cu Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * update docs * Update src/treelearner/cuda/cuda_leaf_splits.cu Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * update documentation to indicate supporting of multi-node multi-gpu training of CUDA version * add header guard * update document for parameters * fix lint errors * fix header ordering * update Nccl to NCCL --------- Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
1 parent 55a2696 commit 569f89a

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

57 files changed

+2101
-1013
lines changed

.ci/check-python-dists.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ if [ "$PY_MINOR_VER" -gt 7 ]; then
3232
--inspect \
3333
--ignore 'compiled-objects-have-debug-symbols'\
3434
--ignore 'distro-too-large-compressed' \
35-
--max-allowed-size-uncompressed '120M' \
35+
--max-allowed-size-uncompressed '500M' \
3636
--max-allowed-files 800 \
3737
"$(echo "${DIST_DIR}"/*)" || exit 1
3838
elif { test "$(uname -m)" = "aarch64"; }; then

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ endif()
217217

218218
if(USE_CUDA)
219219
find_package(CUDAToolkit 11.0 REQUIRED)
220+
find_package(NCCL REQUIRED)
220221
include_directories(${CUDAToolkit_INCLUDE_DIRS})
221222
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall")
222223

@@ -438,6 +439,7 @@ set(
438439
LGBM_CUDA_SOURCES
439440
src/boosting/cuda/cuda_score_updater.cpp
440441
src/boosting/cuda/cuda_score_updater.cu
442+
src/boosting/cuda/nccl_gbdt.cpp
441443
src/metric/cuda/cuda_binary_metric.cpp
442444
src/metric/cuda/cuda_pointwise_metric.cpp
443445
src/metric/cuda/cuda_regression_metric.cpp
@@ -588,6 +590,10 @@ if(USE_GPU)
588590
target_link_libraries(lightgbm_objs PUBLIC ${OpenCL_LIBRARY} ${Boost_LIBRARIES})
589591
endif()
590592

593+
if(USE_CUDA)
594+
target_link_libraries(lightgbm_objs PUBLIC ${NCCL_LIBRARY})
595+
endif()
596+
591597
if(__INTEGRATE_OPENCL)
592598
# targets OpenCL and Boost are added in IntegratedOpenCL.cmake
593599
add_dependencies(lightgbm_objs OpenCL Boost)

cmake/modules/FindNCCL.cmake

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
#
14+
# Tries to find NCCL headers and libraries.
15+
#
16+
# Usage of this module as follows:
17+
#
18+
# find_package(NCCL)
19+
#
20+
# Variables used by this module, they can change the default behaviour and need
21+
# to be set before calling find_package:
22+
#
23+
# NCCL_ROOT - When set, this path is inspected instead of standard library
24+
# locations as the root of the NCCL installation.
25+
# The environment variable NCCL_ROOT overrides this variable.
26+
#
27+
# This module defines
28+
# NCCL_FOUND, whether nccl has been found
29+
# NCCL_INCLUDE_DIR, directory containing header
30+
# NCCL_LIBRARY, directory containing nccl library
31+
# NCCL_LIB_NAME, nccl library name
32+
# USE_NCCL_LIB_PATH, when set, NCCL_LIBRARY path is also inspected for the
33+
# location of the nccl library. This would disable
34+
# switching between static and shared.
35+
#
36+
# This module assumes that the user has already called find_package(CUDA)
37+
38+
if(NCCL_LIBRARY)
39+
if(NOT USE_NCCL_LIB_PATH)
40+
# Don't cache NCCL_LIBRARY to enable switching between static and shared.
41+
unset(NCCL_LIBRARY CACHE)
42+
endif()
43+
endif()
44+
45+
if(BUILD_WITH_SHARED_NCCL)
46+
# libnccl.so
47+
set(NCCL_LIB_NAME nccl)
48+
else()
49+
# libnccl_static.a
50+
set(NCCL_LIB_NAME nccl_static)
51+
endif()
52+
53+
find_path(NCCL_INCLUDE_DIR
54+
NAMES nccl.h
55+
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)
56+
57+
find_library(NCCL_LIBRARY
58+
NAMES ${NCCL_LIB_NAME}
59+
PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)
60+
61+
message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
62+
63+
include(FindPackageHandleStandardArgs)
64+
find_package_handle_standard_args(NCCL DEFAULT_MSG
65+
NCCL_INCLUDE_DIR NCCL_LIBRARY)
66+
67+
mark_as_advanced(
68+
NCCL_INCLUDE_DIR
69+
NCCL_LIBRARY
70+
)

docs/Installation-Guide.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -692,9 +692,9 @@ Refer to `GPU Docker folder <https://github.com/microsoft/LightGBM/tree/master/d
692692
Build CUDA Version
693693
~~~~~~~~~~~~~~~~~~
694694

695-
The `original GPU version <#build-gpu-version>`__ of LightGBM (``device_type=gpu``) is based on OpenCL.
695+
The `original GPU version <#build-gpu-version>`__ of LightGBM (``device_type=gpu``) is based on OpenCL, and only computes histograms on GPUs, with other parts of training in CPUs.
696696

697-
The CUDA-based version (``device_type=cuda``) is a separate implementation.
697+
The CUDA-based version (``device_type=cuda``) is a separate implementation that runs significantly faster by putting all the training process on GPUs. It also supports multi-GPU, and multi-node multi-GPU training.
698698
Use this version in Linux environments with an NVIDIA GPU with compute capability 6.0 or higher.
699699

700700
Windows

docs/Parameters.rst

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1373,8 +1373,18 @@ GPU Parameters
13731373

13741374
- ``-1`` means the default device in the selected platform
13751375

1376+
- in multi-GPU case (``num_gpu>1``) means ID of the master GPU
1377+
13761378
- **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
13771379

1380+
- ``gpu_device_id_list`` :raw-html:`<a id="gpu_device_id_list" title="Permalink to this parameter" href="#gpu_device_id_list">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string
1381+
1382+
- list of CUDA device IDs
1383+
1384+
- **Note**: can be used only in CUDA implementation (``device_type="cuda"``) and when ``num_gpu>1``
1385+
1386+
- if empty, the devices with the smallest IDs will be used
1387+
13781388
- ``gpu_use_dp`` :raw-html:`<a id="gpu_use_dp" title="Permalink to this parameter" href="#gpu_use_dp">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
13791389

13801390
- set this to ``true`` to use double precision math on GPU (by default single precision is used)
@@ -1383,10 +1393,16 @@ GPU Parameters
13831393

13841394
- ``num_gpu`` :raw-html:`<a id="num_gpu" title="Permalink to this parameter" href="#num_gpu">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, constraints: ``num_gpu > 0``
13851395

1386-
- number of GPUs
1396+
- number of GPUs used for training in this node
13871397

13881398
- **Note**: can be used only in CUDA implementation (``device_type="cuda"``)
13891399

1400+
- if ``0``, only 1 GPU will be used
1401+
1402+
- used in both single-machine and distributed learning applications
1403+
1404+
- in distributed learning application, each machine can use different number of GPUs
1405+
13901406
.. end params list
13911407
13921408
Others

include/LightGBM/boosting.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,9 +309,11 @@ class LIGHTGBM_EXPORT Boosting {
309309
* \param format Format of model
310310
* \param config config for boosting
311311
* \param filename name of model file, if existing will continue to train from this model
312+
* \param device_type type of device, can be cpu, gpu or cuda
313+
* \param num_gpu number of GPUs to use
312314
* \return The boosting object
313315
*/
314-
static Boosting* CreateBoosting(const std::string& type, const char* filename);
316+
static Boosting* CreateBoosting(const std::string& type, const char* filename, const std::string& device_type, const int num_gpu);
315317

316318
virtual std::string GetLoadedParam() const = 0;
317319

include/LightGBM/config.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1125,16 +1125,25 @@ struct Config {
11251125

11261126
// desc = OpenCL device ID in the specified platform or CUDA device ID. Each GPU in the selected platform has a unique device ID
11271127
// desc = ``-1`` means the default device in the selected platform
1128+
// desc = in multi-GPU case (``num_gpu>1``) means ID of the master GPU
11281129
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
11291130
int gpu_device_id = -1;
11301131

1132+
// desc = list of CUDA device IDs
1133+
// desc = **Note**: can be used only in CUDA implementation (``device_type="cuda"``) and when ``num_gpu>1``
1134+
// desc = if empty, the devices with the smallest IDs will be used
1135+
std::string gpu_device_id_list = "";
1136+
11311137
// desc = set this to ``true`` to use double precision math on GPU (by default single precision is used)
11321138
// desc = **Note**: can be used only in OpenCL implementation (``device_type="gpu"``), in CUDA implementation only double precision is currently supported
11331139
bool gpu_use_dp = false;
11341140

11351141
// check = >0
1136-
// desc = number of GPUs
1142+
// desc = number of GPUs used for training in this node
11371143
// desc = **Note**: can be used only in CUDA implementation (``device_type="cuda"``)
1144+
// desc = if ``0``, only 1 GPU will be used
1145+
// desc = used in both single-machine and distributed learning applications
1146+
// desc = in distributed learning application, each machine can use different number of GPUs
11381147
int num_gpu = 1;
11391148

11401149
#ifndef __NVCC__

include/LightGBM/cuda/cuda_column_data.hpp

Lines changed: 39 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <LightGBM/bin.h>
1414
#include <LightGBM/utils/openmp_wrapper.h>
1515

16+
#include <memory>
1617
#include <cstdint>
1718
#include <vector>
1819

@@ -39,11 +40,11 @@ class CUDAColumnData {
3940
const std::vector<uint8_t>& feature_mfb_is_na,
4041
const std::vector<int>& feature_to_column);
4142

42-
const void* GetColumnData(const int column_index) const { return data_by_column_[column_index]; }
43+
const uint8_t* GetColumnData(const int column_index) const { return data_by_column_[column_index]->RawData(); }
4344

4445
void CopySubrow(const CUDAColumnData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices);
4546

46-
void* const* cuda_data_by_column() const { return cuda_data_by_column_; }
47+
uint8_t* const* cuda_data_by_column() const { return cuda_data_by_column_.RawData(); }
4748

4849
uint32_t feature_min_bin(const int feature_index) const { return feature_min_bin_[feature_index]; }
4950

@@ -63,42 +64,50 @@ class CUDAColumnData {
6364

6465
uint8_t feature_mfb_is_na(const int feature_index) const { return feature_mfb_is_na_[feature_index]; }
6566

66-
const uint32_t* cuda_feature_min_bin() const { return cuda_feature_min_bin_; }
67+
const uint32_t* cuda_feature_min_bin() const { return cuda_feature_min_bin_.RawData(); }
6768

68-
const uint32_t* cuda_feature_max_bin() const { return cuda_feature_max_bin_; }
69+
const uint32_t* cuda_feature_max_bin() const { return cuda_feature_max_bin_.RawData(); }
6970

70-
const uint32_t* cuda_feature_offset() const { return cuda_feature_offset_; }
71+
const uint32_t* cuda_feature_offset() const { return cuda_feature_offset_.RawData(); }
7172

72-
const uint32_t* cuda_feature_most_freq_bin() const { return cuda_feature_most_freq_bin_; }
73+
const uint32_t* cuda_feature_most_freq_bin() const { return cuda_feature_most_freq_bin_.RawData(); }
7374

74-
const uint32_t* cuda_feature_default_bin() const { return cuda_feature_default_bin_; }
75+
const uint32_t* cuda_feature_default_bin() const { return cuda_feature_default_bin_.RawData(); }
7576

76-
const uint8_t* cuda_feature_missing_is_zero() const { return cuda_feature_missing_is_zero_; }
77+
const uint8_t* cuda_feature_missing_is_zero() const { return cuda_feature_missing_is_zero_.RawData(); }
7778

78-
const uint8_t* cuda_feature_missing_is_na() const { return cuda_feature_missing_is_na_; }
79+
const uint8_t* cuda_feature_missing_is_na() const { return cuda_feature_missing_is_na_.RawData(); }
7980

80-
const uint8_t* cuda_feature_mfb_is_zero() const { return cuda_feature_mfb_is_zero_; }
81+
const uint8_t* cuda_feature_mfb_is_zero() const { return cuda_feature_mfb_is_zero_.RawData(); }
8182

82-
const uint8_t* cuda_feature_mfb_is_na() const { return cuda_feature_mfb_is_na_; }
83+
const uint8_t* cuda_feature_mfb_is_na() const { return cuda_feature_mfb_is_na_.RawData(); }
8384

84-
const int* cuda_feature_to_column() const { return cuda_feature_to_column_; }
85+
const int* cuda_feature_to_column() const { return cuda_feature_to_column_.RawData(); }
8586

86-
const uint8_t* cuda_column_bit_type() const { return cuda_column_bit_type_; }
87+
const uint8_t* cuda_column_bit_type() const { return cuda_column_bit_type_.RawData(); }
8788

8889
int feature_to_column(const int feature_index) const { return feature_to_column_[feature_index]; }
8990

9091
uint8_t column_bit_type(const int column_index) const { return column_bit_type_[column_index]; }
9192

9293
private:
9394
template <bool IS_SPARSE, bool IS_4BIT, typename BIN_TYPE>
94-
void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, void** out_column_data_pointer);
95+
void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, CUDAVector<uint8_t>* out_column_data_pointer);
9596

96-
void LaunchCopySubrowKernel(void* const* in_cuda_data_by_column);
97+
void LaunchCopySubrowKernel(uint8_t* const* in_cuda_data_by_column);
9798

9899
void InitColumnMetaInfo();
99100

100101
void ResizeWhenCopySubrow(const data_size_t num_used_indices);
101102

103+
std::vector<uint8_t*> GetDataByColumnPointers(const std::vector<std::unique_ptr<CUDAVector<uint8_t>>>& data_by_column) const {
104+
std::vector<uint8_t*> data_by_column_pointers(data_by_column.size(), nullptr);
105+
for (size_t i = 0; i < data_by_column.size(); ++i) {
106+
data_by_column_pointers[i] = reinterpret_cast<uint8_t*>(data_by_column[i]->RawData());
107+
}
108+
return data_by_column_pointers;
109+
}
110+
102111
int gpu_device_id_;
103112
int num_threads_;
104113
data_size_t num_data_;
@@ -113,24 +122,24 @@ class CUDAColumnData {
113122
std::vector<uint8_t> feature_missing_is_na_;
114123
std::vector<uint8_t> feature_mfb_is_zero_;
115124
std::vector<uint8_t> feature_mfb_is_na_;
116-
void** cuda_data_by_column_;
125+
CUDAVector<uint8_t*> cuda_data_by_column_;
117126
std::vector<int> feature_to_column_;
118-
std::vector<void*> data_by_column_;
119-
120-
uint8_t* cuda_column_bit_type_;
121-
uint32_t* cuda_feature_min_bin_;
122-
uint32_t* cuda_feature_max_bin_;
123-
uint32_t* cuda_feature_offset_;
124-
uint32_t* cuda_feature_most_freq_bin_;
125-
uint32_t* cuda_feature_default_bin_;
126-
uint8_t* cuda_feature_missing_is_zero_;
127-
uint8_t* cuda_feature_missing_is_na_;
128-
uint8_t* cuda_feature_mfb_is_zero_;
129-
uint8_t* cuda_feature_mfb_is_na_;
130-
int* cuda_feature_to_column_;
127+
std::vector<std::unique_ptr<CUDAVector<uint8_t>>> data_by_column_;
128+
129+
CUDAVector<uint8_t> cuda_column_bit_type_;
130+
CUDAVector<uint32_t> cuda_feature_min_bin_;
131+
CUDAVector<uint32_t> cuda_feature_max_bin_;
132+
CUDAVector<uint32_t> cuda_feature_offset_;
133+
CUDAVector<uint32_t> cuda_feature_most_freq_bin_;
134+
CUDAVector<uint32_t> cuda_feature_default_bin_;
135+
CUDAVector<uint8_t> cuda_feature_missing_is_zero_;
136+
CUDAVector<uint8_t> cuda_feature_missing_is_na_;
137+
CUDAVector<uint8_t> cuda_feature_mfb_is_zero_;
138+
CUDAVector<uint8_t> cuda_feature_mfb_is_na_;
139+
CUDAVector<int> cuda_feature_to_column_;
131140

132141
// used when bagging with subset
133-
data_size_t* cuda_used_indices_;
142+
CUDAVector<data_size_t> cuda_used_indices_;
134143
data_size_t num_used_indices_;
135144
data_size_t cur_subset_buffer_size_;
136145
};

include/LightGBM/cuda/cuda_metadata.hpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,20 +35,20 @@ class CUDAMetadata {
3535

3636
void SetInitScore(const double* init_score, data_size_t len);
3737

38-
const label_t* cuda_label() const { return cuda_label_; }
38+
const label_t* cuda_label() const { return cuda_label_.RawData(); }
3939

40-
const label_t* cuda_weights() const { return cuda_weights_; }
40+
const label_t* cuda_weights() const { return cuda_weights_.RawData(); }
4141

42-
const data_size_t* cuda_query_boundaries() const { return cuda_query_boundaries_; }
42+
const data_size_t* cuda_query_boundaries() const { return cuda_query_boundaries_.RawData(); }
4343

44-
const label_t* cuda_query_weights() const { return cuda_query_weights_; }
44+
const label_t* cuda_query_weights() const { return cuda_query_weights_.RawData(); }
4545

4646
private:
47-
label_t* cuda_label_;
48-
label_t* cuda_weights_;
49-
data_size_t* cuda_query_boundaries_;
50-
label_t* cuda_query_weights_;
51-
double* cuda_init_score_;
47+
CUDAVector<label_t> cuda_label_;
48+
CUDAVector<label_t> cuda_weights_;
49+
CUDAVector<data_size_t> cuda_query_boundaries_;
50+
CUDAVector<label_t> cuda_query_weights_;
51+
CUDAVector<double> cuda_init_score_;
5252
};
5353

5454
} // namespace LightGBM

0 commit comments

Comments
 (0)