Skip to content

Commit 569f89a

Browse files
[CUDA] Multi-GPU for CUDA Version (#6138)
* initialize nccl * change year in header * add implementation of nccl gbdt * add nccl topology * clean up * clean up * set nccl info * support quantized training with categorical features on cpu * remove white spaces * add tests for quantized training with categorical features * skip tests for cuda version * fix cases when only 1 data block in row-wise quantized histogram construction with 8 inner bits * remove useless capture * fix inconsistency of gpu devices * fix creating boosting object from file * change num_gpu to num_gpus in test case * fix objective initialization fix lint errors * fix c++ compilation warning * fix lint errors * fix compilation warnings * change num_gpu to num_gpus in R test case * add nccl synchronization in tree training * fix global num data update * fix ruff-format issues * use global num data in split finder * explicit initialization of NCCLInfo members * fix compilation * use CUDAVector * use CUDAVector * merge master * use CUDAVector * use CUDAVector for cuda tree and column data * update gbdt * changes for cuda tree * use CUDAVector for cuda column data * disable cuda by default * fix single machine gbdt * clean up * fix typo * fix lint issues * use num_gpu instead of num_gpus * fix compilation error * fix cpp lint errors * fix reset config for cuda data partition * fix subrow copy in cuda column data * fix cmakelint errors * Update include/LightGBM/config.h Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update include/LightGBM/config.h Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update include/LightGBM/cuda/cuda_nccl_topology.hpp Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update include/LightGBM/config.h Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update src/treelearner/cuda/cuda_data_partition.cu Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update src/treelearner/cuda/cuda_data_partition.cu Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * Update src/treelearner/cuda/cuda_leaf_splits.cu 
Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * remove WARPSIZE before #6086 is merged * Update src/treelearner/cuda/cuda_leaf_splits.cu Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * update docs * Update src/treelearner/cuda/cuda_leaf_splits.cu Co-authored-by: Nikita Titov <nekit94-08@mail.ru> * update documentation to indicate supporting of multi-node multi-gpu training of CUDA version * add header guard * update document for parameters * fix lint errors * fix header ordering * update Nccl to NCCL --------- Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
1 parent 55a2696 commit 569f89a

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

57 files changed

+2101
-1013
lines changed

.ci/check-python-dists.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ if [ "$PY_MINOR_VER" -gt 7 ]; then
3232
--inspect \
3333
--ignore 'compiled-objects-have-debug-symbols'\
3434
--ignore 'distro-too-large-compressed' \
35-
--max-allowed-size-uncompressed '120M' \
35+
--max-allowed-size-uncompressed '500M' \
3636
--max-allowed-files 800 \
3737
"$(echo "${DIST_DIR}"/*)" || exit 1
3838
elif { test "$(uname -m)" = "aarch64"; }; then

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ endif()
217217

218218
if(USE_CUDA)
219219
find_package(CUDAToolkit 11.0 REQUIRED)
220+
find_package(NCCL REQUIRED)
220221
include_directories(${CUDAToolkit_INCLUDE_DIRS})
221222
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall")
222223

@@ -438,6 +439,7 @@ set(
438439
LGBM_CUDA_SOURCES
439440
src/boosting/cuda/cuda_score_updater.cpp
440441
src/boosting/cuda/cuda_score_updater.cu
442+
src/boosting/cuda/nccl_gbdt.cpp
441443
src/metric/cuda/cuda_binary_metric.cpp
442444
src/metric/cuda/cuda_pointwise_metric.cpp
443445
src/metric/cuda/cuda_regression_metric.cpp
@@ -588,6 +590,10 @@ if(USE_GPU)
588590
target_link_libraries(lightgbm_objs PUBLIC ${OpenCL_LIBRARY} ${Boost_LIBRARIES})
589591
endif()
590592

593+
if(USE_CUDA)
594+
target_link_libraries(lightgbm_objs PUBLIC ${NCCL_LIBRARY})
595+
endif()
596+
591597
if(__INTEGRATE_OPENCL)
592598
# targets OpenCL and Boost are added in IntegratedOpenCL.cmake
593599
add_dependencies(lightgbm_objs OpenCL Boost)

cmake/modules/FindNCCL.cmake

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
#
14+
# Tries to find NCCL headers and libraries.
15+
#
16+
# Usage of this module as follows:
17+
#
18+
# find_package(NCCL)
19+
#
20+
# Variables used by this module, they can change the default behaviour and need
21+
# to be set before calling find_package:
22+
#
23+
# NCCL_ROOT - When set, this path is inspected instead of standard library
24+
# locations as the root of the NCCL installation.
25+
# The environment variable NCCL_ROOT overrides this variable.
26+
#
27+
# This module defines
28+
# NCCL_FOUND, whether nccl has been found
29+
# NCCL_INCLUDE_DIR, directory containing header
30+
# NCCL_LIBRARY, directory containing nccl library
31+
# NCCL_LIB_NAME, nccl library name
32+
# USE_NCCL_LIB_PATH, when set, NCCL_LIBRARY path is also inspected for the
33+
# location of the nccl library. This would disable
34+
# switching between static and shared.
35+
#
36+
# This module assumes that the user has already called find_package(CUDA)
37+
38+
if(NCCL_LIBRARY)
39+
if(NOT USE_NCCL_LIB_PATH)
40+
# Don't cache NCCL_LIBRARY to enable switching between static and shared.
41+
unset(NCCL_LIBRARY CACHE)
42+
endif()
43+
endif()
44+
45+
if(BUILD_WITH_SHARED_NCCL)
46+
# libnccl.so
47+
set(NCCL_LIB_NAME nccl)
48+
else()
49+
# libnccl_static.a
50+
set(NCCL_LIB_NAME nccl_static)
51+
endif()
52+
53+
find_path(NCCL_INCLUDE_DIR
54+
NAMES nccl.h
55+
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)
56+
57+
find_library(NCCL_LIBRARY
58+
NAMES ${NCCL_LIB_NAME}
59+
PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)
60+
61+
message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
62+
63+
include(FindPackageHandleStandardArgs)
64+
find_package_handle_standard_args(NCCL DEFAULT_MSG
65+
NCCL_INCLUDE_DIR NCCL_LIBRARY)
66+
67+
mark_as_advanced(
68+
NCCL_INCLUDE_DIR
69+
NCCL_LIBRARY
70+
)

docs/Installation-Guide.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -692,9 +692,9 @@ Refer to `GPU Docker folder <https://github.com/microsoft/LightGBM/tree/master/d
692692
Build CUDA Version
693693
~~~~~~~~~~~~~~~~~~
694694

695-
The `original GPU version <#build-gpu-version>`__ of LightGBM (``device_type=gpu``) is based on OpenCL.
695+
The `original GPU version <#build-gpu-version>`__ of LightGBM (``device_type=gpu``) is based on OpenCL, and only computes histograms on GPUs, with other parts of training in CPUs.
696696

697-
The CUDA-based version (``device_type=cuda``) is a separate implementation.
697+
The CUDA-based version (``device_type=cuda``) is a separate implementation that runs significantly faster by putting all the training process on GPUs. It also supports multi-GPU, and multi-node multi-GPU training.
698698
Use this version in Linux environments with an NVIDIA GPU with compute capability 6.0 or higher.
699699

700700
Windows

docs/Parameters.rst

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1373,8 +1373,18 @@ GPU Parameters
13731373

13741374
- ``-1`` means the default device in the selected platform
13751375

1376+
- in multi-GPU case (``num_gpu>1``) means ID of the master GPU
1377+
13761378
- **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
13771379

1380+
- ``gpu_device_id_list`` :raw-html:`<a id="gpu_device_id_list" title="Permalink to this parameter" href="#gpu_device_id_list">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string
1381+
1382+
- list of CUDA device IDs
1383+
1384+
- **Note**: can be used only in CUDA implementation (``device_type="cuda"``) and when ``num_gpu>1``
1385+
1386+
- if empty, the devices with the smallest IDs will be used
1387+
13781388
- ``gpu_use_dp`` :raw-html:`<a id="gpu_use_dp" title="Permalink to this parameter" href="#gpu_use_dp">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
13791389

13801390
- set this to ``true`` to use double precision math on GPU (by default single precision is used)
@@ -1383,10 +1393,16 @@ GPU Parameters
13831393

13841394
- ``num_gpu`` :raw-html:`<a id="num_gpu" title="Permalink to this parameter" href="#num_gpu">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, constraints: ``num_gpu > 0``
13851395

1386-
- number of GPUs
1396+
- number of GPUs used for training in this node
13871397

13881398
- **Note**: can be used only in CUDA implementation (``device_type="cuda"``)
13891399

1400+
- if ``0``, only 1 GPU will be used
1401+
1402+
- used in both single-machine and distributed learning applications
1403+
1404+
- in distributed learning application, each machine can use different number of GPUs
1405+
13901406
.. end params list
13911407
13921408
Others

include/LightGBM/boosting.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,9 +309,11 @@ class LIGHTGBM_EXPORT Boosting {
309309
* \param format Format of model
310310
* \param config config for boosting
311311
* \param filename name of model file, if existing will continue to train from this model
312+
* \param device_type type of device, can be cpu, gpu or cuda
313+
* \param num_gpu number of GPUs to use
312314
* \return The boosting object
313315
*/
314-
static Boosting* CreateBoosting(const std::string& type, const char* filename);
316+
static Boosting* CreateBoosting(const std::string& type, const char* filename, const std::string& device_type, const int num_gpu);
315317

316318
virtual std::string GetLoadedParam() const = 0;
317319

include/LightGBM/config.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1125,16 +1125,25 @@ struct Config {
11251125

11261126
// desc = OpenCL device ID in the specified platform or CUDA device ID. Each GPU in the selected platform has a unique device ID
11271127
// desc = ``-1`` means the default device in the selected platform
1128+
// desc = in multi-GPU case (``num_gpu>1``) means ID of the master GPU
11281129
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
11291130
int gpu_device_id = -1;
11301131

1132+
// desc = list of CUDA device IDs
1133+
// desc = **Note**: can be used only in CUDA implementation (``device_type="cuda"``) and when ``num_gpu>1``
1134+
// desc = if empty, the devices with the smallest IDs will be used
1135+
std::string gpu_device_id_list = "";
1136+
11311137
// desc = set this to ``true`` to use double precision math on GPU (by default single precision is used)
11321138
// desc = **Note**: can be used only in OpenCL implementation (``device_type="gpu"``), in CUDA implementation only double precision is currently supported
11331139
bool gpu_use_dp = false;
11341140

11351141
// check = >0
1136-
// desc = number of GPUs
1142+
// desc = number of GPUs used for training in this node
11371143
// desc = **Note**: can be used only in CUDA implementation (``device_type="cuda"``)
1144+
// desc = if ``0``, only 1 GPU will be used
1145+
// desc = used in both single-machine and distributed learning applications
1146+
// desc = in distributed learning application, each machine can use different number of GPUs
11381147
int num_gpu = 1;
11391148

11401149
#ifndef __NVCC__

include/LightGBM/cuda/cuda_column_data.hpp

Lines changed: 39 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <LightGBM/bin.h>
1414
#include <LightGBM/utils/openmp_wrapper.h>
1515

16+
#include <memory>
1617
#include <cstdint>
1718
#include <vector>
1819

@@ -39,11 +40,11 @@ class CUDAColumnData {
3940
const std::vector<uint8_t>& feature_mfb_is_na,
4041
const std::vector<int>& feature_to_column);
4142

42-
const void* GetColumnData(const int column_index) const { return data_by_column_[column_index]; }
43+
const uint8_t* GetColumnData(const int column_index) const { return data_by_column_[column_index]->RawData(); }
4344

4445
void CopySubrow(const CUDAColumnData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices);
4546

46-
void* const* cuda_data_by_column() const { return cuda_data_by_column_; }
47+
uint8_t* const* cuda_data_by_column() const { return cuda_data_by_column_.RawData(); }
4748

4849
uint32_t feature_min_bin(const int feature_index) const { return feature_min_bin_[feature_index]; }
4950

@@ -63,42 +64,50 @@ class CUDAColumnData {
6364

6465
uint8_t feature_mfb_is_na(const int feature_index) const { return feature_mfb_is_na_[feature_index]; }
6566

66-
const uint32_t* cuda_feature_min_bin() const { return cuda_feature_min_bin_; }
67+
const uint32_t* cuda_feature_min_bin() const { return cuda_feature_min_bin_.RawData(); }
6768

68-
const uint32_t* cuda_feature_max_bin() const { return cuda_feature_max_bin_; }
69+
const uint32_t* cuda_feature_max_bin() const { return cuda_feature_max_bin_.RawData(); }
6970

70-
const uint32_t* cuda_feature_offset() const { return cuda_feature_offset_; }
71+
const uint32_t* cuda_feature_offset() const { return cuda_feature_offset_.RawData(); }
7172

72-
const uint32_t* cuda_feature_most_freq_bin() const { return cuda_feature_most_freq_bin_; }
73+
const uint32_t* cuda_feature_most_freq_bin() const { return cuda_feature_most_freq_bin_.RawData(); }
7374

74-
const uint32_t* cuda_feature_default_bin() const { return cuda_feature_default_bin_; }
75+
const uint32_t* cuda_feature_default_bin() const { return cuda_feature_default_bin_.RawData(); }
7576

76-
const uint8_t* cuda_feature_missing_is_zero() const { return cuda_feature_missing_is_zero_; }
77+
const uint8_t* cuda_feature_missing_is_zero() const { return cuda_feature_missing_is_zero_.RawData(); }
7778

78-
const uint8_t* cuda_feature_missing_is_na() const { return cuda_feature_missing_is_na_; }
79+
const uint8_t* cuda_feature_missing_is_na() const { return cuda_feature_missing_is_na_.RawData(); }
7980

80-
const uint8_t* cuda_feature_mfb_is_zero() const { return cuda_feature_mfb_is_zero_; }
81+
const uint8_t* cuda_feature_mfb_is_zero() const { return cuda_feature_mfb_is_zero_.RawData(); }
8182

82-
const uint8_t* cuda_feature_mfb_is_na() const { return cuda_feature_mfb_is_na_; }
83+
const uint8_t* cuda_feature_mfb_is_na() const { return cuda_feature_mfb_is_na_.RawData(); }
8384

84-
const int* cuda_feature_to_column() const { return cuda_feature_to_column_; }
85+
const int* cuda_feature_to_column() const { return cuda_feature_to_column_.RawData(); }
8586

86-
const uint8_t* cuda_column_bit_type() const { return cuda_column_bit_type_; }
87+
const uint8_t* cuda_column_bit_type() const { return cuda_column_bit_type_.RawData(); }
8788

8889
int feature_to_column(const int feature_index) const { return feature_to_column_[feature_index]; }
8990

9091
uint8_t column_bit_type(const int column_index) const { return column_bit_type_[column_index]; }
9192

9293
private:
9394
template <bool IS_SPARSE, bool IS_4BIT, typename BIN_TYPE>
94-
void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, void** out_column_data_pointer);
95+
void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, CUDAVector<uint8_t>* out_column_data_pointer);
9596

96-
void LaunchCopySubrowKernel(void* const* in_cuda_data_by_column);
97+
void LaunchCopySubrowKernel(uint8_t* const* in_cuda_data_by_column);
9798

9899
void InitColumnMetaInfo();
99100

100101
void ResizeWhenCopySubrow(const data_size_t num_used_indices);
101102

103+
std::vector<uint8_t*> GetDataByColumnPointers(const std::vector<std::unique_ptr<CUDAVector<uint8_t>>>& data_by_column) const {
104+
std::vector<uint8_t*> data_by_column_pointers(data_by_column.size(), nullptr);
105+
for (size_t i = 0; i < data_by_column.size(); ++i) {
106+
data_by_column_pointers[i] = reinterpret_cast<uint8_t*>(data_by_column[i]->RawData());
107+
}
108+
return data_by_column_pointers;
109+
}
110+
102111
int gpu_device_id_;
103112
int num_threads_;
104113
data_size_t num_data_;
@@ -113,24 +122,24 @@ class CUDAColumnData {
113122
std::vector<uint8_t> feature_missing_is_na_;
114123
std::vector<uint8_t> feature_mfb_is_zero_;
115124
std::vector<uint8_t> feature_mfb_is_na_;
116-
void** cuda_data_by_column_;
125+
CUDAVector<uint8_t*> cuda_data_by_column_;
117126
std::vector<int> feature_to_column_;
118-
std::vector<void*> data_by_column_;
119-
120-
uint8_t* cuda_column_bit_type_;
121-
uint32_t* cuda_feature_min_bin_;
122-
uint32_t* cuda_feature_max_bin_;
123-
uint32_t* cuda_feature_offset_;
124-
uint32_t* cuda_feature_most_freq_bin_;
125-
uint32_t* cuda_feature_default_bin_;
126-
uint8_t* cuda_feature_missing_is_zero_;
127-
uint8_t* cuda_feature_missing_is_na_;
128-
uint8_t* cuda_feature_mfb_is_zero_;
129-
uint8_t* cuda_feature_mfb_is_na_;
130-
int* cuda_feature_to_column_;
127+
std::vector<std::unique_ptr<CUDAVector<uint8_t>>> data_by_column_;
128+
129+
CUDAVector<uint8_t> cuda_column_bit_type_;
130+
CUDAVector<uint32_t> cuda_feature_min_bin_;
131+
CUDAVector<uint32_t> cuda_feature_max_bin_;
132+
CUDAVector<uint32_t> cuda_feature_offset_;
133+
CUDAVector<uint32_t> cuda_feature_most_freq_bin_;
134+
CUDAVector<uint32_t> cuda_feature_default_bin_;
135+
CUDAVector<uint8_t> cuda_feature_missing_is_zero_;
136+
CUDAVector<uint8_t> cuda_feature_missing_is_na_;
137+
CUDAVector<uint8_t> cuda_feature_mfb_is_zero_;
138+
CUDAVector<uint8_t> cuda_feature_mfb_is_na_;
139+
CUDAVector<int> cuda_feature_to_column_;
131140

132141
// used when bagging with subset
133-
data_size_t* cuda_used_indices_;
142+
CUDAVector<data_size_t> cuda_used_indices_;
134143
data_size_t num_used_indices_;
135144
data_size_t cur_subset_buffer_size_;
136145
};

include/LightGBM/cuda/cuda_metadata.hpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,20 +35,20 @@ class CUDAMetadata {
3535

3636
void SetInitScore(const double* init_score, data_size_t len);
3737

38-
const label_t* cuda_label() const { return cuda_label_; }
38+
const label_t* cuda_label() const { return cuda_label_.RawData(); }
3939

40-
const label_t* cuda_weights() const { return cuda_weights_; }
40+
const label_t* cuda_weights() const { return cuda_weights_.RawData(); }
4141

42-
const data_size_t* cuda_query_boundaries() const { return cuda_query_boundaries_; }
42+
const data_size_t* cuda_query_boundaries() const { return cuda_query_boundaries_.RawData(); }
4343

44-
const label_t* cuda_query_weights() const { return cuda_query_weights_; }
44+
const label_t* cuda_query_weights() const { return cuda_query_weights_.RawData(); }
4545

4646
private:
47-
label_t* cuda_label_;
48-
label_t* cuda_weights_;
49-
data_size_t* cuda_query_boundaries_;
50-
label_t* cuda_query_weights_;
51-
double* cuda_init_score_;
47+
CUDAVector<label_t> cuda_label_;
48+
CUDAVector<label_t> cuda_weights_;
49+
CUDAVector<data_size_t> cuda_query_boundaries_;
50+
CUDAVector<label_t> cuda_query_weights_;
51+
CUDAVector<double> cuda_init_score_;
5252
};
5353

5454
} // namespace LightGBM

0 commit comments

Comments
 (0)