@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
1313limitations under the License. */
1414
1515#include " paddle/platform/profiler.h"
16+ #include < iomanip>
17+ #include < map>
18+ #include " glog/logging.h"
1619
1720namespace paddle {
1821namespace platform {
1922
2023// The profiler state, the initial value is ProfilerState::kDisabled
2124static ProfilerState g_state = ProfilerState::kDisabled ;
25+ // To record which timer the profiler used, CUDA or CPU.
26+ static std::string g_profiler_place = " " ;
2227// The thread local event list only can be accessed by the specific thread
2328// The thread index of each thread
2429static thread_local int32_t g_thread_id;
@@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() {
4348
4449Event::Event (EventKind kind, std::string name, uint32_t thread_id,
4550 DeviceContext* dev_ctx)
46- : kind_(kind),
47- name_ (std::move(name)),
48- thread_id_(thread_id),
49- has_cuda_(false ) {
51+ : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false ) {
5052#ifdef PADDLE_WITH_CUDA
5153 auto * cuda_dev_ctx = static_cast <const CUDADeviceContext*>(dev_ctx);
5254 if (cuda_dev_ctx) {
@@ -72,19 +74,19 @@ std::string Event::kind() const {
7274 PADDLE_THROW (" Unknown EventKind." );
7375}
7476
75- double Event::CpuElapsedUs (const Event& e) const {
76- return (e.cpu_ns_ - cpu_ns_) / (1000 .0 );
77+ double Event::CpuElapsedMs (const Event& e) const {
78+ return (e.cpu_ns_ - cpu_ns_) / (1000000 .0 );
7779}
7880
79- double Event::CudaElapsedUs (const Event& e) const {
81+ double Event::CudaElapsedMs (const Event& e) const {
8082#ifdef PADDLE_WITH_CUDA
8183 PADDLE_ENFORCE (e.has_cuda () && has_cuda ());
8284 PADDLE_ENFORCE (e.device () == device ());
8385 PADDLE_ENFORCE (cudaEventSynchronize (event_));
8486 PADDLE_ENFORCE (cudaEventSynchronize (e.event ()));
8587 float ms;
8688 PADDLE_ENFORCE (cudaEventElapsedTime (&ms, event_, e.event ()));
87- return ms * 1000.0 ;
89+ return ms;
8890#else
8991 PADDLE_THROW (" CUDA is not enabled" );
9092#endif
@@ -113,21 +115,27 @@ inline EventList& GetEventList() {
113115}
114116
115117void Mark (const std::string& name, DeviceContext* dev_ctx) {
116- GetEventList ().Record (EventKind::kMark , std::move (name), g_thread_id,
117- dev_ctx);
118+ GetEventList ().Record (EventKind::kMark , name, g_thread_id, dev_ctx);
119+ }
120+
121+ void PushEvent (const std::string& name, DeviceContext* dev_ctx) {
122+ GetEventList ().Record (EventKind::kPushRange , name, g_thread_id, dev_ctx);
123+ }
124+
125+ void PopEvent (const std::string& name, DeviceContext* dev_ctx) {
126+ GetEventList ().Record (EventKind::kPopRange , name, g_thread_id, dev_ctx);
118127}
119128
120129RecordEvent::RecordEvent (const std::string& name, DeviceContext* dev_ctx) {
121130 if (g_state == ProfilerState::kDisabled ) return ;
122131 dev_ctx_ = dev_ctx;
123- GetEventList (). Record (EventKind:: kPushRange , std::move (name), g_thread_id,
124- dev_ctx_);
132+ name_ = name;
133+ PushEvent (name_, dev_ctx_);
125134}
126135
127136RecordEvent::~RecordEvent () {
128137 if (g_state == ProfilerState::kDisabled ) return ;
129- GetEventList ().Record (EventKind::kPopRange , std::string (), g_thread_id,
130- dev_ctx_);
138+ PopEvent (name_, dev_ctx_);
131139}
132140
133141void EnableProfiler (ProfilerState state) {
@@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) {
138146 " The profiling state should be disabled when calling " ,
139147 " EnableProfiler." );
140148 g_state = state;
149+ g_profiler_place = (g_state == ProfilerState::kCUDA ) ? " CUDA" : " CPU" ;
141150#ifdef PADDLE_WITH_CUDA
142151 if (g_state == ProfilerState::kCUDA ) {
143152 // Generate some dummy evenets first to reduce the startup overhead.
@@ -169,5 +178,152 @@ std::vector<std::vector<Event>> DisableProfiler() {
169178 return result;
170179}
171180
181+ void ParseEvents (std::vector<std::vector<Event>>& events,
182+ EventSortingKey sorted_by) {
183+ if (g_profiler_place == " " ) return ;
184+
185+ std::string sorted_domain;
186+ std::function<bool (EventItem&, EventItem&)> sorted_func;
187+ switch (sorted_by) {
188+ case EventSortingKey::kCalls :
189+ sorted_domain = " number of calls" ;
190+ sorted_func = [](EventItem& a, EventItem& b) {
191+ return a.calls > b.calls ;
192+ };
193+ break ;
194+ case EventSortingKey::kTotal :
195+ sorted_domain = " total time" ;
196+ sorted_func = [](EventItem& a, EventItem& b) {
197+ return a.total_time > b.total_time ;
198+ };
199+ break ;
200+ case EventSortingKey::kMin :
201+ sorted_domain = " minimum time" ;
202+ sorted_func = [](EventItem& a, EventItem& b) {
203+ return a.min_time > b.min_time ;
204+ };
205+ break ;
206+ case EventSortingKey::kMax :
207+ sorted_domain = " maximum time" ;
208+ sorted_func = [](EventItem& a, EventItem& b) {
209+ return a.max_time > b.max_time ;
210+ };
211+ break ;
212+ case EventSortingKey::kAve :
213+ sorted_domain = " average time" ;
214+ sorted_func = [](EventItem& a, EventItem& b) {
215+ return a.ave_time > b.ave_time ;
216+ };
217+ break ;
218+ default :
219+ sorted_domain = " event end time" ;
220+ }
221+
222+ std::vector<std::vector<EventItem>> events_table;
223+ size_t max_name_width = 0 ;
224+ for (size_t i = 0 ; i < events.size (); i++) {
225+ std::list<Event> pushed_events;
226+ std::vector<EventItem> event_items;
227+ std::unordered_map<std::string, int > event_idx;
228+
229+ for (size_t j = 0 ; j < events[i].size (); j++) {
230+ if (events[i][j].kind () == " push" ) {
231+ pushed_events.push_back (events[i][j]);
232+ } else if (events[i][j].kind () == " pop" ) {
233+ std::list<Event>::reverse_iterator rit = pushed_events.rbegin ();
234+ while (rit != pushed_events.rend () &&
235+ rit->name () != events[i][j].name ()) {
236+ ++rit;
237+ }
238+
239+ if (rit != pushed_events.rend ()) {
240+ double event_time = (g_profiler_place == " CUDA" )
241+ ? rit->CudaElapsedMs (events[i][j])
242+ : rit->CpuElapsedMs (events[i][j]);
243+ std::string event_name =
244+ " thread" + std::to_string (rit->thread_id ()) + " ::" + rit->name ();
245+ max_name_width = std::max (max_name_width, event_name.size ());
246+
247+ if (event_idx.find (event_name) == event_idx.end ()) {
248+ event_idx[event_name] = event_items.size ();
249+ EventItem event_item = {event_name, 1 , event_time,
250+ event_time, event_time, event_time};
251+ event_items.push_back (event_item);
252+ } else {
253+ int index = event_idx[event_name];
254+ event_items[index].calls += 1 ;
255+ // total time
256+ event_items[index].total_time += event_time;
257+ // min time
258+ event_items[index].min_time =
259+ std::min (event_time, event_items[index].min_time );
260+ // max time
261+ event_items[index].max_time =
262+ std::max (event_time, event_items[index].max_time );
263+ }
264+
265+ // remove the push marker from the list
266+ pushed_events.erase ((++rit).base ());
267+ } else {
268+ LOG (WARNING) << " Cannot find the push marker of event \' "
269+ << events[i][j].name ()
270+ << " \' , which will be ignored in profiling report." ;
271+ }
272+ }
273+ }
274+ // average time
275+ for (auto & item : event_items) {
276+ item.ave_time = item.total_time / item.calls ;
277+ }
278+ // sort
279+ if (sorted_by != EventSortingKey::kDefault ) {
280+ std::sort (event_items.begin (), event_items.end (), sorted_func);
281+ }
282+
283+ events_table.push_back (event_items);
284+ // log warning if there are events with `push` but without `pop`
285+ std::list<Event>::reverse_iterator rit = pushed_events.rbegin ();
286+ while (rit != pushed_events.rend ()) {
287+ LOG (WARNING) << " Cannot find the pop marker of event \' " << rit->name ()
288+ << " \' , which will be ignored in profiling report." ;
289+ ++rit;
290+ }
291+ }
292+
293+ // Print report
294+ PrintProfilingReport (events_table, sorted_domain, max_name_width + 4 , 12 );
295+ }
296+
297+ void PrintProfilingReport (std::vector<std::vector<EventItem>>& events_table,
298+ std::string& sorted_domain, const size_t name_width,
299+ const size_t data_width) {
300+ // Output header information
301+ std::cout << " \n ------------------------->"
302+ << " Profiling Report "
303+ << " <-------------------------\n\n " ;
304+ std::cout << " Place: " << g_profiler_place << std::endl;
305+ std::cout << " Time unit: ms" << std::endl;
306+ std::cout << " Sorted by " << sorted_domain
307+ << " in descending order in the same thread\n\n " ;
308+ // Output events table
309+ std::cout.setf (std::ios::left);
310+ std::cout << std::setw (name_width) << " Event" << std::setw (data_width)
311+ << " Calls" << std::setw (data_width) << " Total"
312+ << std::setw (data_width) << " Min." << std::setw (data_width)
313+ << " Max." << std::setw (data_width) << " Ave." << std::endl;
314+ for (size_t i = 0 ; i < events_table.size (); ++i) {
315+ for (size_t j = 0 ; j < events_table[i].size (); ++j) {
316+ EventItem& event_item = events_table[i][j];
317+ std::cout << std::setw (name_width) << event_item.name
318+ << std::setw (data_width) << event_item.calls
319+ << std::setw (data_width) << event_item.total_time
320+ << std::setw (data_width) << event_item.min_time
321+ << std::setw (data_width) << event_item.max_time
322+ << std::setw (data_width) << event_item.ave_time << std::endl;
323+ }
324+ }
325+ std::cout << std::endl;
326+ }
327+
172328} // namespace platform
173329} // namespace paddle
0 commit comments