Skip to content

Commit d284cf8

Browse files
authored
Merge pull request #9037 from panyx0718/develop
Better timeline
2 parents 128adf5 + 4840c49 commit d284cf8

File tree

9 files changed

+190
-65
lines changed

9 files changed

+190
-65
lines changed

paddle/fluid/framework/executor.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ limitations under the License. */
2525
#include "paddle/fluid/framework/op_registry.h"
2626
#include "paddle/fluid/framework/reader.h"
2727
#include "paddle/fluid/platform/place.h"
28+
#include "paddle/fluid/platform/profiler.h"
2829

2930
DECLARE_bool(benchmark);
3031
DEFINE_bool(check_nan_inf, false,
@@ -33,6 +34,11 @@ DEFINE_bool(check_nan_inf, false,
3334

3435
namespace paddle {
3536
namespace framework {
37+
namespace {
38+
// block id starts from 0. This id is used to represent the codeblock
39+
// wrapping the first block 0.
40+
int kProgramId = -1;
41+
} // namespace
3642

3743
struct ExecutorPrepareContext {
3844
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id)
@@ -94,6 +100,7 @@ static void CheckTensorNANOrInf(const std::string& name,
94100

95101
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
96102
bool create_local_scope, bool create_vars) {
103+
platform::RecordBlock b(block_id);
97104
auto* ctx = Prepare(pdesc, block_id);
98105
RunPreparedContext(ctx, scope, create_local_scope, create_vars);
99106
delete ctx;
@@ -184,6 +191,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
184191
std::map<std::string, LoDTensor*>& fetch_targets,
185192
const std::string& feed_holder_name,
186193
const std::string& fetch_holder_name) {
194+
platform::RecordBlock b(kProgramId);
187195
auto* copy_program = new ProgramDesc(program);
188196
auto* global_block = copy_program->MutableBlock(0);
189197

paddle/fluid/operators/parallel_do_op.cc

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License. */
1818
#include "paddle/fluid/framework/op_registry.h"
1919
#include "paddle/fluid/framework/threadpool.h"
2020
#include "paddle/fluid/operators/detail/safe_ref.h"
21+
#include "paddle/fluid/platform/profiler.h"
2122

2223
namespace paddle {
2324
namespace operators {
@@ -158,11 +159,14 @@ class ParallelDoOp : public framework::OperatorBase {
158159
auto &place = places[place_idx];
159160
auto *cur_scope = sub_scopes[place_idx];
160161

161-
workers.emplace_back(framework::Async([program, cur_scope, place, block] {
162-
framework::Executor executor(place);
163-
executor.Run(*program, cur_scope, block->ID(),
164-
false /*create_local_scope*/);
165-
}));
162+
workers.emplace_back(
163+
framework::Async([program, cur_scope, place, block, place_idx] {
164+
// Give the thread an id to distinguish parallel block with same id.
165+
platform::RecordThread rt(static_cast<int>(place_idx) + 1);
166+
framework::Executor executor(place);
167+
executor.Run(*program, cur_scope, block->ID(),
168+
false /*create_local_scope*/);
169+
}));
166170
}
167171
for (auto &worker : workers) {
168172
worker.wait();
@@ -234,11 +238,14 @@ class ParallelDoGradOp : public framework::OperatorBase {
234238
auto *cur_scope = sub_scopes[i];
235239

236240
// execute
237-
workers.emplace_back(framework::Async([program, cur_scope, place, block] {
238-
framework::Executor executor(place);
239-
executor.Run(*program, cur_scope, block->ID(),
240-
false /*create_local_scope*/);
241-
}));
241+
workers.emplace_back(
242+
framework::Async([program, cur_scope, place, block, i] {
243+
// Give the thread an id to distinguish parallel block with same id.
244+
platform::RecordThread rt(static_cast<int>(i) + 1);
245+
framework::Executor executor(place);
246+
executor.Run(*program, cur_scope, block->ID(),
247+
false /*create_local_scope*/);
248+
}));
242249
}
243250
for (auto &worker : workers) {
244251
worker.wait();

paddle/fluid/platform/device_tracer.cc

Lines changed: 51 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,14 @@ limitations under the License. */
2626
namespace paddle {
2727
namespace platform {
2828
namespace {
29+
// Current thread's id. Note, we don't distinguish nested threads
30+
// for now.
31+
thread_local int cur_thread_id = 0;
32+
// Tracking the nested block stacks of each thread.
33+
thread_local std::deque<int> block_id_stack;
34+
// Tracking the nested event stacks.
35+
thread_local std::deque<std::string> annotation_stack;
2936

30-
thread_local const char *cur_annotation = nullptr;
3137
std::once_flag tracer_once_flag;
3238
DeviceTracer *tracer = nullptr;
3339
} // namespace
@@ -191,19 +197,19 @@ class DeviceTracerImpl : public DeviceTracer {
191197
correlations_[id] = anno;
192198
}
193199

194-
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
195-
if (!anno) {
196-
// TODO(panyx0718): Currently, it doesn't support nested situation
197-
// Up-level can be cleared by low-level and therefore get nullptr
198-
// here.
200+
void AddCPURecords(const std::string &anno, uint64_t start_ns,
201+
uint64_t end_ns, int64_t device_id, int64_t thread_id) {
202+
if (anno.empty()) {
203+
VLOG(1) << "Empty timeline annotation.";
199204
return;
200205
}
201206
std::lock_guard<std::mutex> l(trace_mu_);
202-
cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, 0});
207+
cpu_records_.push_back(
208+
CPURecord{anno, start_ns, end_ns, device_id, thread_id});
203209
}
204210

205211
void AddMemRecords(const std::string &name, uint64_t start_ns,
206-
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
212+
uint64_t end_ns, int64_t device_id, int64_t stream_id,
207213
uint32_t correlation_id, uint64_t bytes) {
208214
// 0 means timestamp information could not be collected for the kernel.
209215
if (start_ns == 0 || end_ns == 0) {
@@ -215,8 +221,8 @@ class DeviceTracerImpl : public DeviceTracer {
215221
stream_id, correlation_id, bytes});
216222
}
217223

218-
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
219-
uint32_t stream_id, uint32_t correlation_id) {
224+
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
225+
int64_t stream_id, uint32_t correlation_id) {
220226
// 0 means timestamp information could not be collected for the kernel.
221227
if (start == 0 || end == 0) {
222228
VLOG(3) << correlation_id << " cannot be traced";
@@ -270,27 +276,30 @@ class DeviceTracerImpl : public DeviceTracer {
270276
continue;
271277
}
272278
auto *event = profile_pb.add_events();
279+
event->set_type(proto::Event::GPUKernel);
273280
event->set_name(correlations_.at(r.correlation_id));
274281
event->set_start_ns(r.start_ns);
275282
event->set_end_ns(r.end_ns);
276-
event->set_stream_id(r.stream_id);
283+
event->set_sub_device_id(r.stream_id);
277284
event->set_device_id(r.device_id);
278285
}
279286

280287
for (const CPURecord &r : cpu_records_) {
281288
auto *event = profile_pb.add_events();
289+
event->set_type(proto::Event::CPU);
282290
event->set_name(r.name);
283291
event->set_start_ns(r.start_ns);
284292
event->set_end_ns(r.end_ns);
285-
event->set_stream_id(r.thread_id);
286-
event->set_device_id(-1);
293+
event->set_sub_device_id(r.thread_id);
294+
event->set_device_id(r.device_id);
287295
}
288296
for (const MemRecord &r : mem_records_) {
289297
auto *event = profile_pb.add_events();
298+
event->set_type(proto::Event::GPUKernel);
290299
event->set_name(r.name);
291300
event->set_start_ns(r.start_ns);
292301
event->set_end_ns(r.end_ns);
293-
event->set_stream_id(r.stream_id);
302+
event->set_sub_device_id(r.stream_id);
294303
event->set_device_id(r.device_id);
295304
event->mutable_memcopy()->set_bytes(r.bytes);
296305
}
@@ -323,8 +332,9 @@ class DeviceTracerImpl : public DeviceTracer {
323332
if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
324333
(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
325334
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
326-
const std::string anno =
327-
cur_annotation ? cur_annotation : cbInfo->symbolName;
335+
const std::string anno = !annotation_stack.empty()
336+
? annotation_stack.back()
337+
: cbInfo->symbolName;
328338
tracer->AddAnnotation(cbInfo->correlationId, anno);
329339
}
330340
} else {
@@ -351,14 +361,15 @@ class DeviceTracerDummy : public DeviceTracer {
351361

352362
void AddAnnotation(uint64_t id, const std::string &anno) {}
353363

354-
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
364+
void AddCPURecords(const std::string &anno, uint64_t start_ns,
365+
uint64_t end_ns, int64_t device_id, int64_t thread_id) {}
355366

356367
void AddMemRecords(const std::string &name, uint64_t start_ns,
357-
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
368+
uint64_t end_ns, int64_t device_id, int64_t stream_id,
358369
uint32_t correlation_id, uint64_t bytes) {}
359370

360-
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
361-
uint32_t stream_id, uint32_t correlation_id) {}
371+
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
372+
int64_t stream_id, uint32_t correlation_id) {}
362373

363374
bool IsEnabled() { return false; }
364375

@@ -384,11 +395,28 @@ DeviceTracer *GetDeviceTracer() {
384395
return tracer;
385396
}
386397

387-
void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
398+
void SetCurAnnotation(const std::string &anno) {
399+
annotation_stack.push_back(anno);
400+
}
401+
402+
void ClearCurAnnotation() { annotation_stack.pop_back(); }
403+
404+
std::string CurAnnotation() {
405+
if (annotation_stack.empty()) return "";
406+
return annotation_stack.back();
407+
}
408+
409+
void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
410+
411+
void ClearCurBlock() { block_id_stack.pop_back(); }
412+
413+
int BlockDepth() { return block_id_stack.size(); }
414+
415+
void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
388416

389-
void ClearCurAnnotation() { cur_annotation = nullptr; }
417+
void ClearCurThread() { cur_thread_id = 0; }
390418

391-
const char *CurAnnotation() { return cur_annotation; }
419+
int CurThread() { return cur_thread_id; }
392420

393421
} // namespace platform
394422
} // namespace paddle

paddle/fluid/platform/device_tracer.h

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,22 +32,23 @@ class DeviceTracer {
3232
struct KernelRecord {
3333
uint64_t start_ns;
3434
uint64_t end_ns;
35-
uint32_t device_id;
36-
uint32_t stream_id;
35+
int64_t device_id;
36+
int64_t stream_id;
3737
uint32_t correlation_id;
3838
};
3939
struct CPURecord {
4040
std::string name;
4141
uint64_t start_ns;
4242
uint64_t end_ns;
43-
uint64_t thread_id;
43+
int64_t device_id;
44+
int64_t thread_id;
4445
};
4546
struct MemRecord {
4647
std::string name;
4748
uint64_t start_ns;
4849
uint64_t end_ns;
49-
uint32_t device_id;
50-
uint32_t stream_id;
50+
int64_t device_id;
51+
int64_t stream_id;
5152
uint32_t correlation_id;
5253
uint64_t bytes;
5354
};
@@ -64,18 +65,18 @@ class DeviceTracer {
6465
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
6566

6667
virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
67-
uint64_t end_ns, uint32_t device_id,
68-
uint32_t stream_id, uint32_t correlation_id,
68+
uint64_t end_ns, int64_t device_id,
69+
int64_t stream_id, uint32_t correlation_id,
6970
uint64_t bytes) = 0;
7071

71-
virtual void AddCPURecords(const char* anno, uint64_t start_ns,
72-
uint64_t end_ns) = 0;
72+
virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
73+
uint64_t end_ns, int64_t device_id,
74+
int64_t thread_id) = 0;
7375

7476
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
7577
// added before for human readability.
76-
virtual void AddKernelRecords(uint64_t start, uint64_t end,
77-
uint32_t device_id, uint32_t stream_id,
78-
uint32_t correlation_id) = 0;
78+
virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
79+
int64_t stream_id, uint32_t correlation_id) = 0;
7980

8081
// Generate a proto after done (Disabled).
8182
virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
@@ -87,10 +88,18 @@ class DeviceTracer {
8788
DeviceTracer* GetDeviceTracer();
8889

8990
// Set a name for the cuda kernel operation being launched by the thread.
90-
void SetCurAnnotation(const char* anno);
91+
void SetCurAnnotation(const std::string& anno);
9192
// Clear the name after the operation is done.
9293
void ClearCurAnnotation();
9394
// Current name of the operation being run in the thread.
94-
const char* CurAnnotation();
95+
std::string CurAnnotation();
96+
97+
void SetCurBlock(int block_id);
98+
void ClearCurBlock();
99+
int BlockDepth();
100+
101+
void SetCurThread(int thread_id);
102+
void ClearCurThread();
103+
int CurThread();
95104
} // namespace platform
96105
} // namespace paddle

paddle/fluid/platform/profiler.cc

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,19 +147,48 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
147147
name_ = name;
148148
PushEvent(name_, dev_ctx_);
149149
// Maybe need the same push/pop behavior.
150-
SetCurAnnotation(name_.c_str());
150+
SetCurAnnotation(name_);
151151
}
152152

153153
RecordEvent::~RecordEvent() {
154154
if (g_state == ProfilerState::kDisabled) return;
155155
DeviceTracer* tracer = GetDeviceTracer();
156156
if (tracer) {
157-
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
157+
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
158+
BlockDepth(), CurThread());
158159
}
159160
ClearCurAnnotation();
160161
PopEvent(name_, dev_ctx_);
161162
}
162163

164+
RecordBlock::RecordBlock(int block_id) : start_ns_(PosixInNsec()) {
165+
if (g_state == ProfilerState::kDisabled) return;
166+
SetCurBlock(block_id);
167+
name_ = string::Sprintf("block_%d", block_id);
168+
}
169+
170+
RecordBlock::~RecordBlock() {
171+
if (g_state == ProfilerState::kDisabled) return;
172+
DeviceTracer* tracer = GetDeviceTracer();
173+
if (tracer) {
174+
// We try to put all blocks at the same nested depth in the
175+
// same timeline lane. and distinguish the using thread_id.
176+
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
177+
CurThread());
178+
}
179+
ClearCurBlock();
180+
}
181+
182+
RecordThread::RecordThread(int thread_id) {
183+
if (g_state == ProfilerState::kDisabled) return;
184+
SetCurThread(thread_id);
185+
}
186+
187+
RecordThread::~RecordThread() {
188+
if (g_state == ProfilerState::kDisabled) return;
189+
ClearCurThread();
190+
}
191+
163192
void EnableProfiler(ProfilerState state) {
164193
PADDLE_ENFORCE(state != ProfilerState::kDisabled,
165194
"Can't enbale profling, since the input state is ",

paddle/fluid/platform/profiler.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,24 @@ struct RecordEvent {
118118
std::string full_name_;
119119
};
120120

121+
struct RecordBlock {
122+
explicit RecordBlock(int block_id);
123+
~RecordBlock();
124+
125+
private:
126+
std::string name_;
127+
uint64_t start_ns_;
128+
int block_id_;
129+
};
130+
131+
struct RecordThread {
132+
explicit RecordThread(int thread_id);
133+
~RecordThread();
134+
135+
private:
136+
uint64_t start_ns_;
137+
};
138+
121139
// Return the event list of all threads. Assumed the returned value calls
122140
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
123141
std::vector<std::vector<Event>> GetAllEvents();

0 commit comments

Comments
 (0)