Fix NCCLBcast hang up bug in Parallel Executor #11377
Changes from 3 commits
2dde5f7, fe520d1, 15afefe, 212bdd6
    @@ -25,6 +25,7 @@ limitations under the License. */
     #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
     #include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
     #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
    +#include "paddle/fluid/memory/malloc.h"
     #include "paddle/fluid/platform/profiler.h"

     namespace paddle {
    @@ -63,7 +64,7 @@ ParallelExecutor::ParallelExecutor(
       member_->global_scope_ = scope;
       member_->use_cuda_ = exec_strategy.use_cuda_;

    -  // Step 1. Bcast the params to devs.
    +  // Step 2. Bcast the params to devs.
       // Create local scopes
       if (local_scopes.empty()) {
         member_->own_local_scope_ = true;
    @@ -99,7 +100,7 @@ ParallelExecutor::ParallelExecutor(
       }
       // Startup Program has been run. All local scopes has correct parameters.

    -  // Step 2. Create vars in each scope;
    +  // Step 3. Create vars in each scope;
       std::vector<details::VariableInfo> var_infos;
       for (auto *var : main_program.Block(0).AllVars()) {
         var_infos.emplace_back();
    @@ -108,7 +109,7 @@ ParallelExecutor::ParallelExecutor(
         var_infos.back().persistable_ = var->Persistable();
       }

    -  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
    +  // Step 4. Convert main_program to SSA form and dependency graph. Also, insert
       // ncclOp

       details::SSAGraphBuilderFactory builder_factory(
    @@ -145,9 +146,9 @@ void ParallelExecutor::BCastParamsToGPUs(
         auto &dims = main_tensor.dims();
         if (paddle::platform::is_gpu_place(main_tensor.place())) {
     #ifdef PADDLE_WITH_CUDA
    +      std::vector<void *> buffers;
           size_t numel = main_tensor.numel();
           ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
    -      platform::NCCLGroupGuard guard;
           for (size_t i = 0; i < member_->places_.size(); ++i) {
             auto place = member_->places_[i];
             void *buffer;

Contributor: I don't think the modification of lines 159~167 is necessary.

Author (Collaborator): Actually, the thrown exception will not be handled properly; this PR was submitted to fix this bug.
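For readers following this thread, here is a minimal sketch of the hazard the author describes, written against the raw NCCL API rather than Paddle's platform::dynload wrappers. The GroupGuard and AllocateOnDevice names are illustrative stand-ins rather than Paddle's actual code; the next hunk shows the real fix.

```cpp
#include <cuda_runtime.h>
#include <nccl.h>

#include <stdexcept>
#include <vector>

// Illustrative RAII guard: opens an NCCL group on construction and closes it
// on destruction. The destructor also runs during stack unwinding.
struct GroupGuard {
  GroupGuard() { ncclGroupStart(); }
  ~GroupGuard() { ncclGroupEnd(); }
};

// Hypothetical device allocation that may throw, e.g. on out-of-memory.
void *AllocateOnDevice(size_t bytes) {
  void *ptr = nullptr;
  if (cudaMalloc(&ptr, bytes) != cudaSuccess) {
    throw std::runtime_error("cudaMalloc failed");
  }
  return ptr;
}

// Pre-fix shape of the broadcast loop: the group is already open while the
// per-device buffers are being created. If AllocateOnDevice throws after
// ncclBcast has been queued on only some communicators, the GroupGuard
// destructor calls ncclGroupEnd() during unwinding, which can block waiting
// for the communicators that never received their call.
void BroadcastUnsafe(const std::vector<ncclComm_t> &comms,
                     const std::vector<cudaStream_t> &streams, size_t numel) {
  GroupGuard guard;  // ncclGroupStart() happens here
  for (size_t i = 0; i < comms.size(); ++i) {
    void *buffer = AllocateOnDevice(numel * sizeof(float));  // may throw
    ncclBcast(buffer, numel, ncclFloat, /*root=*/0, comms[i], streams[i]);
  }
}  // ncclGroupEnd() happens here, or earlier during unwinding
```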
    @@ -159,11 +160,21 @@
               t->Resize(dims);
               buffer = t->mutable_data(place, main_tensor.type());
             }
    -        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
    -        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
    -                                     nccl_ctx.comm_, nccl_ctx.stream());
    +        buffers.push_back(buffer);
           }
    -      member_->nccl_ctxs_->WaitAll();
    +
    +      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
    +                        "variables' buffer size to bcast NOT equal to places");
    +      {
    +        platform::NCCLGroupGuard guard;
    +        for (size_t i = 0; i < member_->places_.size(); ++i) {
    +          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
    +          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
    +                                       nccl_ctx.comm_, nccl_ctx.stream());
    +        }
    +        member_->nccl_ctxs_->WaitAll();
    +      }
     #else
           PADDLE_THROW("Not compiled with CUDA");
     #endif

Contributor: This line may not be needed? since

Contributor: @chengduoZH Can you please help take a look at this?

Contributor: @typhoonzero line 185 is necessary,
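To summarize the control flow the changed code establishes, here is a condensed sketch using the raw NCCL API, with a hypothetical AllocateOnDevice helper standing in for the mutable_data() calls above. It is an illustration of the ordering only, not Paddle's implementation: everything that can throw happens before the group is opened, so nothing can throw between ncclGroupStart() and ncclGroupEnd().

```cpp
#include <cuda_runtime.h>
#include <nccl.h>

#include <stdexcept>
#include <vector>

// Hypothetical allocation helper standing in for mutable_data(); may throw.
void *AllocateOnDevice(size_t bytes) {
  void *ptr = nullptr;
  if (cudaMalloc(&ptr, bytes) != cudaSuccess) {
    throw std::runtime_error("cudaMalloc failed");
  }
  return ptr;
}

void BroadcastSafe(const std::vector<ncclComm_t> &comms,
                   const std::vector<cudaStream_t> &streams, size_t numel) {
  // Phase 1: collect every destination buffer first. A throw here is
  // harmless because no NCCL group is open yet.
  std::vector<void *> buffers;
  for (size_t i = 0; i < comms.size(); ++i) {
    buffers.push_back(AllocateOnDevice(numel * sizeof(float)));
  }

  // Phase 2: only non-throwing NCCL calls sit between group start and end,
  // mirroring the narrow scope the PR gives platform::NCCLGroupGuard.
  ncclGroupStart();
  for (size_t i = 0; i < comms.size(); ++i) {
    ncclBcast(buffers[i], numel, ncclFloat, /*root=*/0, comms[i], streams[i]);
  }
  ncclGroupEnd();

  // Wait for the asynchronous broadcasts, comparable in spirit to the
  // member_->nccl_ctxs_->WaitAll() call discussed in the thread above.
  for (cudaStream_t stream : streams) {
    cudaStreamSynchronize(stream);
  }
}
```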
    @@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
       }
     }

    +// NOTE(minqiyang): according to the ncclGroupEnd documentations:
    +// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
    +// ncclGroupEnd will wait for all communicators to be initialized, which will
    +// cause blocking problem when a runtime_error was thrown, so try only guard
    +// NCCL actions when use it.
     class NCCLGroupGuard {
      public:
       static std::mutex &NCCLMutex() {

Contributor: Well, I think we can assume that people who develop PaddlePaddle with this file are familiar with the NCCL natives.

Author (Collaborator): To avoid this bug happening again, I am leaving these comments.

Reviewer: Remove this line.
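To make the added NOTE concrete, here is a minimal sketch of the RAII group-guard pattern it documents, assuming a constructor that opens an NCCL group and a destructor that closes it. The mutex mirrors the NCCLMutex() accessor visible in the context lines above; this is an illustration, not the real NCCLGroupGuard body.

```cpp
#include <nccl.h>

#include <mutex>

// Sketch of the group-guard idea documented by the NOTE; the real
// paddle::platform::NCCLGroupGuard may differ in detail.
class GroupGuardSketch {
 public:
  GroupGuardSketch() {
    Mutex().lock();    // serialize concurrent NCCL group usage
    ncclGroupStart();  // open the group
  }
  ~GroupGuardSketch() {
    // Runs on normal exit and during stack unwinding. Per the NOTE above,
    // ncclGroupEnd() waits on every communicator in the group, so an
    // exception thrown inside the guarded scope can turn into a hang here.
    ncclGroupEnd();
    Mutex().unlock();
  }

 private:
  static std::mutex &Mutex() {
    static std::mutex mtx;
    return mtx;
  }
};
```

Because the destructor also runs during stack unwinding, the safest use is exactly what the comment recommends: keep only NCCL calls inside the guarded scope, so ncclGroupEnd() is never reached while some communicators are still waiting for their call.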