Commit 36ca157

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into cycliclr
2 parents: 5ebf5c1 + c4b7c48

File tree

628 files changed, +18674 -6169 lines changed


CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME"
 # Note(zhouwei): It use option above, so put here
 include(init)
 include(generic) # simplify cmake module
+include(experimental) # experimental build options

 if (WITH_GPU AND WITH_XPU)
   message(FATAL_ERROR "Error when compile GPU and XPU at the same time")

cmake/cblas.cmake

Lines changed: 6 additions & 4 deletions
@@ -52,6 +52,7 @@ if(NOT DEFINED CBLAS_PROVIDER)
   set(OPENBLAS_INCLUDE_SEARCH_PATHS
         ${OPENBLAS_ROOT}/include
         /usr/include
+        /usr/include/lapacke
         /usr/include/openblas
         /usr/local/opt/openblas/include)
   set(OPENBLAS_LIB_SEARCH_PATHS
@@ -65,15 +66,17 @@ if(NOT DEFINED CBLAS_PROVIDER)
         PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH)
   find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
         PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+  find_path(OPENBLAS_CONFIG_INC_DIR NAMES openblas_config.h
+        PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
   find_library(OPENBLAS_LIB NAMES openblas
         PATHS ${OPENBLAS_LIB_SEARCH_PATHS})

-  if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB)
-    file(READ "${OPENBLAS_INC_DIR}/openblas_config.h" config_file)
+  if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_CONFIG_INC_DIR AND OPENBLAS_LIB)
+    file(READ "${OPENBLAS_CONFIG_INC_DIR}/openblas_config.h" config_file)
     string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file})
     string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp})

-    if (${ver} VERSION_GREATER_EQUAL "0.3.7")
+    if (${ver} VERSION_GREATER_EQUAL "0.3.5")
       set(CBLAS_PROVIDER OPENBLAS)
       set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
       set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
@@ -138,4 +141,3 @@ if(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS)
 elseif(NOT ${CBLAS_PROVIDER} STREQUAL MKLML)
   target_link_libraries(cblas ${CBLAS_LIBRARIES})
 endif()
-

cmake/experimental.cmake

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# this file contains experimental build options
+
+include(experiments/cuda_module_loading_lazy)

cmake/experiments/cuda_module_loading_lazy.cmake

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# this file contains the experimental build option for lazy CUDA module loading
+# CUDA module lazy loading is supported by CUDA 11.6+
+# this experimental option makes Paddle support lazy loading before CUDA 11.6
+
+option(EXP_CUDA_MODULE_LOADING_LAZY "enable lazy cuda module loading" OFF)
+if (${EXP_CUDA_MODULE_LOADING_LAZY})
+  if (NOT ${ON_INFER} OR NOT ${LINUX})
+    message("EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms")
+    return()
+  endif ()
+  if (NOT ${CUDA_FOUND})
+    message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA")
+    return()
+  endif ()
+  if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.6")
+    message("CUDA 11.6+ already supports lazy module loading")
+    return()
+  endif ()
+
+  message("for CUDA before 11.6, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a")
+  set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "" FORCE)
+  set(CMAKE_CUDA_FLAGS "--cudart shared")
+  enable_language(CUDA)
+  set(CUDA_NVCC_EXECUTABLE "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE)
+  set(CMAKE_CUDA_COMPILER "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE)
+endif()
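
For background on what "module loading" means here, a standalone CUDA driver-API sketch (illustration only, not the nvcc_lazy mechanism itself; kernels.cubin is a hypothetical file): with explicit cuModuleLoad, GPU code stays off the device until the call runs, which is the kind of deferral this option aims to get for precompiled kernels.

#include <cstdio>
#include <cuda.h>

int main() {
  cuInit(0);
  CUdevice dev;
  cuDeviceGet(&dev, 0);
  CUcontext ctx;
  cuCtxCreate(&ctx, 0, dev);

  // Nothing from kernels.cubin touches the GPU until this call:
  CUmodule mod;
  CUresult r = cuModuleLoad(&mod, "kernels.cubin");  // hypothetical file
  std::printf("cuModuleLoad returned %d\n", static_cast<int>(r));

  cuCtxDestroy(ctx);
  return 0;
}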

cmake/external/xpu.cmake

Lines changed: 2 additions & 2 deletions
@@ -9,15 +9,15 @@ SET(XPU_RT_LIB_NAME "libxpurt.so")

 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220511")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220520")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()

 # ubuntu and centos: use output by XDNN API team
 if(NOT DEFINED XPU_XDNN_BASE_URL)
   SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
-  SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220511")
+  SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220520")
 else()
   SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
 endif()

cmake/flags.cmake

Lines changed: 0 additions & 2 deletions
@@ -142,12 +142,10 @@ set(COMMON_FLAGS
     -Wno-unused-function
     -Wno-error=literal-suffix
     -Wno-error=unused-local-typedefs
-    -Wno-error=parentheses-equality # Warnings in pybind11
     -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
     -Wno-error=terminate # Warning in PADDLE_ENFORCE
     -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
     -Wimplicit-fallthrough=0 # Warning in tinyformat.h
-    -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
     ${fsanitize}
 )


paddle/fluid/distributed/collective/ProcessGroup.h

Lines changed: 13 additions & 0 deletions
@@ -113,6 +113,19 @@ class ProcessGroup {
         "ProcessGroup%s does not support receive", GetBackendName()));
   }

+  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(phi::DenseTensor&,
+                                                           int, int,
+                                                           int) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support send", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
+      phi::DenseTensor& tensors, int, int, int) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support receive", GetBackendName()));
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> AllGather(
       std::vector<phi::DenseTensor>&,    // NOLINT
       std::vector<phi::DenseTensor>&) {  // NOLINT
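
For orientation, a usage sketch of the new partial point-to-point API (illustration only, not from this commit; the helper name, process-group setup, and slice bounds are assumptions):

// Hypothetical caller: sends elements [offset, offset + length) of the
// flattened `tensor` to `dst_rank` and waits for completion. `pg` is
// assumed to be an already-initialized ProcessGroupNCCL.
void SendFirstHalf(paddle::distributed::ProcessGroup* pg,
                   phi::DenseTensor& tensor, int dst_rank) {
  const int offset = 0;
  const int length = static_cast<int>(tensor.numel()) / 2;
  auto task = pg->Send_Partial(tensor, dst_rank, offset, length);
  task->Wait();  // block until the send has completed
}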

paddle/fluid/distributed/collective/ProcessGroupNCCL.cc

Lines changed: 47 additions & 0 deletions
@@ -428,6 +428,53 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
   return task;
 }

+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
+    phi::DenseTensor& tensors, int dst_rank, int offset, int length) {
+  // CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
+
+  phi::DenseTensor flatten_tensor;
+  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
+
+  phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length);
+
+  std::vector<phi::DenseTensor> shared_tensors;
+  shared_tensors.push_back(shared_input);
+
+  auto task = PointToPoint(shared_tensors,
+                           [&](phi::DenseTensor& input, ncclComm_t comm,
+                               const gpuStream_t& stream, int dst_rank) {
+                             return platform::dynload::ncclSend(
+                                 input.data(), input.numel(),
+                                 platform::ToNCCLDataType(input.dtype()),
+                                 dst_rank, comm, stream);
+                           },
+                           dst_rank, CommType::SEND);
+  return task;
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
+    phi::DenseTensor& tensors, int src_rank, int offset, int length) {
+  // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length);
+
+  phi::DenseTensor flatten_tensor;
+  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
+  phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length);
+
+  std::vector<phi::DenseTensor> shared_tensors;
+  shared_tensors.push_back(shared_input);
+
+  auto task = PointToPoint(shared_tensors,
+                           [&](phi::DenseTensor& output, ncclComm_t comm,
+                               const gpuStream_t& stream, int src_rank) {
+                             return platform::dynload::ncclRecv(
+                                 output.data(), output.numel(),
+                                 platform::ToNCCLDataType(output.dtype()),
+                                 src_rank, comm, stream);
+                           },
+                           src_rank, CommType::RECV);
+  return task;
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
     std::vector<phi::DenseTensor>& in_tensors,
     std::vector<phi::DenseTensor>& out_tensors) {
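
Both implementations share one pattern: ShareDataWith aliases the tensor's storage, Resize({numel}) views it as 1-D, and Slice(offset, offset + length) picks a contiguous run of elements from that linear view. A standalone sketch of the same index arithmetic in plain C++ (shapes and values are made up for illustration):

#include <cstdio>

int main() {
  // A 2 x 4 "tensor" stored contiguously, as after ShareDataWith + Resize.
  float tensor[2][4] = {{0, 1, 2, 3}, {4, 5, 6, 7}};
  const float* flat = &tensor[0][0];  // the flattened, numel()-long view
  const int offset = 2, length = 3;   // what Slice(offset, offset + length) keeps
  for (int i = offset; i < offset + length; ++i) {
    std::printf("%g ", flat[i]);      // prints: 2 3 4
  }
  std::printf("\n");
  return 0;
}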

paddle/fluid/distributed/collective/ProcessGroupNCCL.h

Lines changed: 8 additions & 0 deletions
@@ -102,6 +102,14 @@ class ProcessGroupNCCL : public ProcessGroup {
   std::shared_ptr<ProcessGroup::Task> Recv(
       std::vector<phi::DenseTensor>& tensors, int src_rank) override;

+  std::shared_ptr<ProcessGroup::Task> Send_Partial(phi::DenseTensor& tensors,
+                                                   int dst_rank, int offset,
+                                                   int length) override;
+
+  std::shared_ptr<ProcessGroup::Task> Recv_Partial(phi::DenseTensor& tensors,
+                                                   int src_rank, int offset,
+                                                   int length) override;
+
   std::shared_ptr<ProcessGroup::Task> AllGather(
       std::vector<phi::DenseTensor>& in_tensors,
       std::vector<phi::DenseTensor>& out_tensors) override;

paddle/fluid/distributed/fleet_executor/dist_model.cc

Lines changed: 3 additions & 3 deletions
@@ -546,9 +546,9 @@ bool DistModel::Run(const std::vector<DistModelTensor> &input_data,

   DistModelTimer timer;
   timer.tic();
-  double feed_elapse;
-  double fleet_exe_elapse;
-  double fetch_elapse;
+  double feed_elapse = 0;
+  double fleet_exe_elapse = 0;
+  double fetch_elapse = 0;

   if (!FeedData(input_data, scope_.get())) {
     LOG(ERROR) << "DistModel failed at feeding data.";
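
The initialization is more than style: when the timing branches are skipped, the old locals were read without ever being assigned, which is undefined behavior. A minimal standalone illustration of the fixed pattern (the function and flag are hypothetical stand-ins, not Paddle code):

#include <cstdio>

// If `profile` is false the assignments are skipped, so zero-initializing
// the locals is what keeps the final sum well-defined on every path.
double TotalElapse(bool profile) {
  double feed_elapse = 0;       // was: double feed_elapse;  (indeterminate)
  double fleet_exe_elapse = 0;
  double fetch_elapse = 0;
  if (profile) {
    feed_elapse = 1.5;          // stand-ins for measured durations
    fleet_exe_elapse = 20.0;
    fetch_elapse = 0.5;
  }
  return feed_elapse + fleet_exe_elapse + fetch_elapse;
}

int main() {
  std::printf("%g %g\n", TotalElapse(false), TotalElapse(true));
  return 0;
}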
