8 changes: 8 additions & 0 deletions paddle/fluid/framework/executor.cc
@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"

DECLARE_bool(benchmark);
DEFINE_bool(check_nan_inf, false,
@@ -33,6 +34,11 @@ DEFINE_bool(check_nan_inf, false,

namespace paddle {
namespace framework {
namespace {
// Block ids start from 0. This id (-1) represents the code block
// wrapping the first block, block 0.
int kProgramId = -1;
Contributor:
Is kProgramId meant to distinguish different programs in a model?

Contributor Author:
No. The global block has block id 0. However, before block 0 starts there is some other pre-processing and post-processing overhead. I wrap that code into block id -1.
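For illustration, a minimal sketch of the nesting this produces (assumed usage based on the RecordBlock RAII guard added in profiler.cc below; the scope layout is illustrative, not code from this PR):

    // RecordBlock emits one CPU record per scope when it is destroyed.
    {
      platform::RecordBlock program(kProgramId);  // recorded as "block_-1"
      // ... program copy, feed-variable setup ...
      {
        platform::RecordBlock global(0);          // recorded as "block_0"
        // ... run the operators of the global block ...
      }
      // ... fetch-variable teardown, still inside "block_-1" ...
    }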

} // namespace

struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id)
@@ -94,6 +100,7 @@ static void CheckTensorNANOrInf(const std::string& name,

void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) {
platform::RecordBlock b(block_id);
auto* ctx = Prepare(pdesc, block_id);
RunPreparedContext(ctx, scope, create_local_scope, create_vars);
delete ctx;
@@ -184,6 +191,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name,
const std::string& fetch_holder_name) {
platform::RecordBlock b(kProgramId);
auto* copy_program = new ProgramDesc(program);
auto* global_block = copy_program->MutableBlock(0);

27 changes: 17 additions & 10 deletions paddle/fluid/operators/parallel_do_op.cc
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace operators {
@@ -158,11 +159,14 @@ class ParallelDoOp : public framework::OperatorBase {
auto &place = places[place_idx];
auto *cur_scope = sub_scopes[place_idx];

workers.emplace_back(framework::Async([program, cur_scope, place, block] {
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
workers.emplace_back(
framework::Async([program, cur_scope, place, block, place_idx] {
// Give the thread an id to distinguish parallel blocks with the same id.
platform::RecordThread rt(static_cast<int>(place_idx) + 1);
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
}
for (auto &worker : workers) {
worker.wait();
@@ -234,11 +238,14 @@ class ParallelDoGradOp : public framework::OperatorBase {
auto *cur_scope = sub_scopes[i];

// execute
workers.emplace_back(framework::Async([program, cur_scope, place, block] {
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
workers.emplace_back(
framework::Async([program, cur_scope, place, block, i] {
// Give the thread an id to distinguish parallel blocks with the same id.
platform::RecordThread rt(static_cast<int>(i) + 1);
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
}
for (auto &worker : workers) {
worker.wait();
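Both hunks apply the same pattern: each worker gets id place_idx + 1 (or i + 1) so that records from parallel runs of the same block land in distinct timeline lanes. A minimal usage sketch for wrapping one's own worker threads (num_workers is hypothetical; reserving lane 0 for the main thread is an assumption inferred from ClearCurThread resetting cur_thread_id to 0):

    // Assumed usage of the RecordThread RAII guard added in profiler.h.
    std::vector<std::thread> threads;
    for (int i = 0; i < num_workers; ++i) {
      threads.emplace_back([i] {
        platform::RecordThread rt(i + 1);  // lane 0 stays with the main thread
        // ... run code containing RecordEvent/RecordBlock scopes; their
        // records are tagged with this thread id ...
      });
    }
    for (auto &t : threads) t.join();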
74 changes: 51 additions & 23 deletions paddle/fluid/platform/device_tracer.cc
@@ -26,8 +26,14 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace {
// Current thread's id. Note, we don't distinguish nested threads
// for now.
thread_local int cur_thread_id = 0;
// Tracks each thread's stack of nested block ids.
thread_local std::deque<int> block_id_stack;
// Tracks each thread's stack of nested event annotations.
thread_local std::deque<std::string> annotation_stack;

thread_local const char *cur_annotation = nullptr;
std::once_flag tracer_once_flag;
DeviceTracer *tracer = nullptr;
} // namespace
@@ -191,19 +197,19 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_[id] = anno;
}

void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
if (!anno) {
// TODO(panyx0718): Currently, it doesn't support nested situation
// Up-level can be cleared by low-level and therefore get nullptr
// here.
void AddCPURecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t thread_id) {
if (anno.empty()) {
VLOG(1) << "Empty timeline annotation.";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, 0});
cpu_records_.push_back(
CPURecord{anno, start_ns, end_ns, device_id, thread_id});
}

void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint64_t end_ns, int64_t device_id, int64_t stream_id,
uint32_t correlation_id, uint64_t bytes) {
// 0 means timestamp information could not be collected for the kernel.
if (start_ns == 0 || end_ns == 0) {
@@ -215,8 +221,8 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes});
}

void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
int64_t stream_id, uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) {
VLOG(3) << correlation_id << " cannot be traced";
@@ -270,27 +276,30 @@ class DeviceTracerImpl : public DeviceTracer {
continue;
}
auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel);
event->set_name(correlations_.at(r.correlation_id));
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id);
}

for (const CPURecord &r : cpu_records_) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.thread_id);
event->set_device_id(-1);
event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id);
}
for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel);
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes);
}
@@ -323,8 +332,9 @@ class DeviceTracerImpl : public DeviceTracer {
if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
const std::string anno =
cur_annotation ? cur_annotation : cbInfo->symbolName;
const std::string anno = !annotation_stack.empty()
? annotation_stack.back()
: cbInfo->symbolName;
tracer->AddAnnotation(cbInfo->correlationId, anno);
}
} else {
@@ -351,14 +361,15 @@ class DeviceTracerDummy : public DeviceTracer {

void AddAnnotation(uint64_t id, const std::string &anno) {}

void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
void AddCPURecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t thread_id) {}

void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint64_t end_ns, int64_t device_id, int64_t stream_id,
uint32_t correlation_id, uint64_t bytes) {}

void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {}
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
int64_t stream_id, uint32_t correlation_id) {}

bool IsEnabled() { return false; }

@@ -384,11 +395,28 @@ DeviceTracer *GetDeviceTracer() {
return tracer;
}

void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
void SetCurAnnotation(const std::string &anno) {
annotation_stack.push_back(anno);
}

void ClearCurAnnotation() { annotation_stack.pop_back(); }

std::string CurAnnotation() {
if (annotation_stack.empty()) return "";
return annotation_stack.back();
}

void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }

void ClearCurBlock() { block_id_stack.pop_back(); }

int BlockDepth() { return block_id_stack.size(); }

void SetCurThread(int thread_id) { cur_thread_id = thread_id; }

void ClearCurAnnotation() { cur_annotation = nullptr; }
void ClearCurThread() { cur_thread_id = 0; }

const char *CurAnnotation() { return cur_annotation; }
int CurThread() { return cur_thread_id; }

} // namespace platform
} // namespace paddle
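The thread-local stacks above make annotations and block ids nest per thread. A hedged sketch of the intended semantics (the commented values are assumptions based on the implementation, not tests from this PR):

    platform::SetCurAnnotation("conv2d");
    platform::SetCurBlock(0);        // enter block 0
    platform::SetCurBlock(1);        // enter a nested sub-block
    // BlockDepth() == 2, CurAnnotation() == "conv2d"
    platform::ClearCurBlock();       // BlockDepth() == 1
    platform::ClearCurBlock();       // BlockDepth() == 0
    platform::ClearCurAnnotation();  // CurAnnotation() == ""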
37 changes: 23 additions & 14 deletions paddle/fluid/platform/device_tracer.h
@@ -32,22 +32,23 @@ class DeviceTracer {
struct KernelRecord {
uint64_t start_ns;
uint64_t end_ns;
uint32_t device_id;
uint32_t stream_id;
int64_t device_id;
int64_t stream_id;
uint32_t correlation_id;
};
struct CPURecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint64_t thread_id;
int64_t device_id;
int64_t thread_id;
};
struct MemRecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint32_t device_id;
uint32_t stream_id;
int64_t device_id;
int64_t stream_id;
uint32_t correlation_id;
uint64_t bytes;
};
@@ -64,18 +65,18 @@ class DeviceTracer {
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;

virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id,
uint64_t end_ns, int64_t device_id,
int64_t stream_id, uint32_t correlation_id,
uint64_t bytes) = 0;

virtual void AddCPURecords(const char* anno, uint64_t start_ns,
uint64_t end_ns) = 0;
virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id) = 0;

// Add CUDA kernel stats. `correlation_id` will be mapped to the
// annotation added earlier, for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end,
uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id) = 0;
virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
int64_t stream_id, uint32_t correlation_id) = 0;

// Generate a proto after done (Disabled).
virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
@@ -87,10 +88,18 @@ class DeviceTracer {
DeviceTracer* GetDeviceTracer();

// Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(const char* anno);
void SetCurAnnotation(const std::string& anno);
// Clear the name after the operation is done.
void ClearCurAnnotation();
// Current name of the operation being run in the thread.
const char* CurAnnotation();
std::string CurAnnotation();

void SetCurBlock(int block_id);
void ClearCurBlock();
int BlockDepth();

void SetCurThread(int thread_id);
void ClearCurThread();
int CurThread();
} // namespace platform
} // namespace paddle
33 changes: 31 additions & 2 deletions paddle/fluid/platform/profiler.cc
@@ -147,19 +147,48 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
name_ = name;
PushEvent(name_, dev_ctx_);
// Maybe need the same push/pop behavior.
SetCurAnnotation(name_.c_str());
SetCurAnnotation(name_);
}

RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
BlockDepth(), CurThread());
}
ClearCurAnnotation();
PopEvent(name_, dev_ctx_);
}

RecordBlock::RecordBlock(int block_id) : start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return;
SetCurBlock(block_id);
name_ = string::Sprintf("block_%d", block_id);
}

RecordBlock::~RecordBlock() {
if (g_state == ProfilerState::kDisabled) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
// We try to put all blocks at the same nested depth in the
// same timeline lane, and distinguish them using thread_id.
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
CurThread());
}
ClearCurBlock();
}

RecordThread::RecordThread(int thread_id) {
if (g_state == ProfilerState::kDisabled) return;
SetCurThread(thread_id);
}

RecordThread::~RecordThread() {
if (g_state == ProfilerState::kDisabled) return;
ClearCurThread();
}
Contributor:
It seems that RecordThread only supports one layer of nesting. If RecordThread recorded the previous thread id before setting CurThread, it could support deeper nesting:

   thread0
        thread1
             thread2
        thread1
   thread0

Contributor Author (@panyx0718, Mar 15, 2018):
Yes, currently it doesn't automatically support thread nesting; see the comments in paddle/fluid/platform/device_tracer.cc. The main reason is that we don't have nested threading yet.

Actually, a user can add code to give nested threads different ids:

int nested_thread_id = CurThread() * 10 + 1;
std::thread t([nested_thread_id]() {
  platform::RecordThread rt(nested_thread_id);
  // ...
});
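For reference, a hedged sketch of the reviewer's suggestion (not part of this PR): saving the previous thread id in the guard would let thread scopes nest the same way the annotation and block stacks do.

    // Hypothetical variant: restore the previous id on destruction so
    // scopes can nest as thread0 -> thread1 -> thread2 -> thread1 -> thread0.
    struct NestedRecordThread {
      explicit NestedRecordThread(int thread_id) : prev_(CurThread()) {
        SetCurThread(thread_id);
      }
      ~NestedRecordThread() { SetCurThread(prev_); }

     private:
      int prev_;
    };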


void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ",
18 changes: 18 additions & 0 deletions paddle/fluid/platform/profiler.h
@@ -118,6 +118,24 @@ struct RecordEvent {
std::string full_name_;
};

struct RecordBlock {
explicit RecordBlock(int block_id);
~RecordBlock();

private:
std::string name_;
uint64_t start_ns_;
int block_id_;
};

struct RecordThread {
explicit RecordThread(int thread_id);
~RecordThread();

private:
uint64_t start_ns_;
};

// Return the event lists of all threads. Assuming the returned value is
// called event_lists, event_lists[i][j] represents the j-th Event of the
// i-th thread.
std::vector<std::vector<Event>> GetAllEvents();