27 changes: 19 additions & 8 deletions paddle/fluid/framework/parallel_executor.cc
@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/memory/malloc.h"
Contributor

Remove this line.

#include "paddle/fluid/platform/profiler.h"

namespace paddle {
@@ -63,7 +64,7 @@ ParallelExecutor::ParallelExecutor(
member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_;

// Step 1. Bcast the params to devs.
// Step 2. Bcast the params to devs.
Contributor

==> Step 1.

// Create local scopes
if (local_scopes.empty()) {
member_->own_local_scope_ = true;
@@ -99,7 +100,7 @@ ParallelExecutor::ParallelExecutor(
}
// Startup Program has been run. All local scopes have correct parameters.

// Step 2. Create vars in each scope;
// Step 3. Create vars in each scope;
std::vector<details::VariableInfo> var_infos;
for (auto *var : main_program.Block(0).AllVars()) {
var_infos.emplace_back();
@@ -108,7 +109,7 @@
var_infos.back().persistable_ = var->Persistable();
}

// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// Step 4. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp

details::SSAGraphBuilderFactory builder_factory(
@@ -145,9 +146,9 @@ void ParallelExecutor::BCastParamsToGPUs(
auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) {
#ifdef PADDLE_WITH_CUDA
Contributor

I don't think the modification of lines 159~167 is necessary, because if the memory of one Place is insufficient, the program will throw an exception at this line.

Collaborator Author

Actually, the thrown exception will not be handled properly; this PR was submitted to fix that bug.

std::vector<void *> buffers;
size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) {
auto place = member_->places_[i];
void *buffer;
@@ -159,11 +160,21 @@
t->Resize(dims);
buffer = t->mutable_data(place, main_tensor.type());
}
auto &nccl_ctx = member_->nccl_ctxs_->at(place);
platform::dynload::ncclBcast(buffer, numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
buffers.push_back(buffer);
}
member_->nccl_ctxs_->WaitAll();

PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
"variables' buffer size to bcast NOT equal to places");
{
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) {
auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
}
member_->nccl_ctxs_->WaitAll();
Contributor

This line may not be needed, since ncclGroupEnd will sync all group calls.

Contributor

@chengduoZH Can you please help take a look at this?

Contributor

@typhoonzero line 185 is necessary:

  • ncclGroupEnd only ensures that a single thread at a time can invoke NCCL functions; it does not sync the calls.
  • the NCCL function calls are asynchronous, so we need WaitAll() to ensure the calls have completed on the GPU side.

}

#else
PADDLE_THROW("Not compiled with CUDA");
#endif
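
The exchange above boils down to a two-phase pattern: first allocate every destination buffer, so an out-of-memory error surfaces before any NCCL group is opened, then issue all broadcasts inside one group and wait on the streams. A minimal, hypothetical C++/NCCL sketch of that pattern (DeviceCtx and BcastToAll are illustrative names, not PaddlePaddle APIs; the element type is assumed to be float):

#include <cuda_runtime.h>
#include <nccl.h>

#include <stdexcept>
#include <vector>

struct DeviceCtx {
  int device_id;
  ncclComm_t comm;
  cudaStream_t stream;
};

void BcastToAll(const std::vector<DeviceCtx> &devs, size_t numel,
                ncclDataType_t dtype, std::vector<void *> *buffers) {
  // Phase 1: allocate every destination buffer up front. If one device runs
  // out of memory, the exception is thrown before any NCCL group is opened,
  // so no communicator is left blocked inside a half-finished group.
  // (In the real code the root's buffer points at the already-filled
  // parameter tensor instead of a fresh allocation.)
  for (const auto &dev : devs) {
    cudaSetDevice(dev.device_id);
    void *buf = nullptr;
    if (cudaMalloc(&buf, numel * sizeof(float)) != cudaSuccess) {
      throw std::runtime_error("insufficient device memory");
    }
    buffers->push_back(buf);
  }

  // Phase 2: with all buffers in hand, issue the broadcasts inside one group.
  ncclGroupStart();
  for (size_t i = 0; i < devs.size(); ++i) {
    ncclBcast((*buffers)[i], numel, dtype, /*root=*/0, devs[i].comm,
              devs[i].stream);
  }
  ncclGroupEnd();

  // ncclGroupEnd() only submits the collectives; they run asynchronously on
  // each stream, so wait for completion explicitly (the role of WaitAll()).
  for (const auto &dev : devs) {
    cudaStreamSynchronize(dev.stream);
  }
}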
5 changes: 5 additions & 0 deletions paddle/fluid/platform/nccl_helper.h
@@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
}
}

// NOTE(minqiyang): according to the ncclGroupEnd documentation:
// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
// ncclGroupEnd will wait for all communicators to be initialized, which will
// cause a blocking problem when a runtime_error is thrown, so only guard
// NCCL actions when using it.

Contributor

Well, I think we can assume that people who develop PaddlePaddle with this file are familiar with the NCCL natives.

Collaborator Author

To avoid this bug happening again, I left these comments.
class NCCLGroupGuard {
public:
static std::mutex &NCCLMutex() {
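
The note above motivates keeping the guarded region as small as possible. A minimal sketch of the RAII pattern behind a guard like NCCLGroupGuard, under the assumption that it pairs a static mutex with ncclGroupStart()/ncclGroupEnd() (GroupGuardSketch is an illustrative name, not the actual Paddle class):

#include <nccl.h>

#include <mutex>

class GroupGuardSketch {
 public:
  static std::mutex &Mutex() {
    static std::mutex m;  // one lock shared by all NCCL group sections
    return m;
  }

  GroupGuardSketch() {
    Mutex().lock();
    ncclGroupStart();  // batch the NCCL calls issued inside this scope
  }

  ~GroupGuardSketch() {
    // ncclGroupEnd() waits for all communicators in the group to be ready,
    // which is why only the NCCL calls themselves should live in this scope.
    ncclGroupEnd();
    Mutex().unlock();
  }
};

// Usage: keep the guarded scope tight and synchronize the streams afterwards.
// {
//   GroupGuardSketch guard;
//   // one ncclBcast(...) per device
// }
// // ...then wait on each stream (what WaitAll() does) outside the guard.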