Commit 60a20b3

q10 authored and facebook-github-bot committed
Add barrier to test regression hypothesis (#3741)
Summary:
Pull Request resolved: #3741

- Add a barrier in front of the `*_warp_per_row_1` kernel invocation to test if the perf regressions might be due to other GPU processes competing for memory bandwidth.

Reviewed By: sryap

Differential Revision: D70227844

fbshipit-source-id: 639ef1cc04a525b556a3c503fc31fbc7ce82123d
1 parent 7c79d33 commit 60a20b3
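
For reference, the isolation idea reduces to bracketing a kernel launch with device-wide synchronizations. A minimal sketch follows (the kernel and wrapper names are illustrative placeholders, not code from this commit): the first cudaDeviceSynchronize() drains work previously issued by this process, and the second blocks until the kernel finishes, so the kernel's duration shows up in a trace without overlapping work from the same process.

// Minimal sketch of the barrier-isolation idea (illustrative only; the names
// scale_kernel and launch_isolated are not part of this commit).
#include <cuda_runtime.h>

__global__ void scale_kernel(float* data, float factor, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    data[i] *= factor;  // trivially bandwidth-bound work
  }
}

void launch_isolated(float* d_data, int n, cudaStream_t stream) {
  // Drain all work previously issued to the device by this process so nothing
  // overlaps with the kernel under inspection.
  cudaDeviceSynchronize();

  scale_kernel<<<(n + 255) / 256, 256, 0, stream>>>(d_data, 2.0f, n);

  // Block until the kernel finishes before any later work is enqueued, so the
  // kernel's duration appears in the trace in isolation.
  cudaDeviceSynchronize();
}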

File tree

2 files changed: +117 -67 lines changed

fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu

Lines changed: 77 additions & 67 deletions
@@ -21,11 +21,11 @@
 {%- set locs_or_addrs_type = "int64_t" if ssd else "int32_t" %}
 
 #include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
-#include "fbgemm_gpu/utils/tensor_accessor.h"
 #include "fbgemm_gpu/sparse_ops.h"
 #include "fbgemm_gpu/split_embeddings_utils.cuh"
+#include "fbgemm_gpu/utils/barrier_isolation.cuh"
 #include "fbgemm_gpu/utils/ops_utils.h"
-
+#include "fbgemm_gpu/utils/tensor_accessor.h"
 {%- if is_rocm %}
 #include "fbgemm_gpu/rocm/cdna_guard.h"
 {%- endif %}
@@ -790,32 +790,34 @@ Tensor {{ embedding_cuda_op }}(
     // {{ locs_or_addrs_tensor }} run ids and sorted_linear_indices run ids.
     auto dev_or_uvm_unique_indices = at::zeros_like(weights_placements);
 
+    DEBUG_KERNEL_BARRIER_ISOLATE([&] {
 #ifdef FBGEMM_GPU_MEMCHECK
     const auto func_name = "split_embedding_backward_count_unique_indices_kernel";
 #endif
-    split_embedding_backward_count_unique_indices_kernel<
-        {{ "int64_t" if nobag else "int32_t" }},
-        {{ "int64_t" if nobag else "uint32_t" }},
-        {{ "true" if nobag else "false" }}
-    ><<<
-        div_round_up(total_unique_indices, kMaxThreads),
-        kMaxThreads,
-        0,
-        at::cuda::getCurrentCUDAStream()
-    >>>(
-        MAKE_PTA_WITH_NAME(
-            func_name, sorted_linear_indices_num_runs, int32_t, 1, 32),
-        MAKE_PTA_WITH_NAME(
-            func_name, sorted_linear_indices_cumulative_run_lengths, int32_t, 1, 32),
-        MAKE_PTA_WITH_NAME(
-            func_name, infos_sorted, {{ "int64_t" if nobag else "int32_t" }}, 1, 32),
-        MAKE_PTA_WITH_NAME(
-            func_name, weights_placements, int32_t, 1, 32),
-        MAKE_PTA_WITH_NAME(
-            func_name, dev_or_uvm_unique_indices, int32_t, 1, 32),
-        info_B_num_bits
-    );
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
+      split_embedding_backward_count_unique_indices_kernel<
+          {{ "int64_t" if nobag else "int32_t" }},
+          {{ "int64_t" if nobag else "uint32_t" }},
+          {{ "true" if nobag else "false" }}
+      ><<<
+          div_round_up(total_unique_indices, kMaxThreads),
+          kMaxThreads,
+          0,
+          at::cuda::getCurrentCUDAStream()
+      >>>(
+          MAKE_PTA_WITH_NAME(
+              func_name, sorted_linear_indices_num_runs, int32_t, 1, 32),
+          MAKE_PTA_WITH_NAME(
+              func_name, sorted_linear_indices_cumulative_run_lengths, int32_t, 1, 32),
+          MAKE_PTA_WITH_NAME(
+              func_name, infos_sorted, {{ "int64_t" if nobag else "int32_t" }}, 1, 32),
+          MAKE_PTA_WITH_NAME(
+              func_name, weights_placements, int32_t, 1, 32),
+          MAKE_PTA_WITH_NAME(
+              func_name, dev_or_uvm_unique_indices, int32_t, 1, 32),
+          info_B_num_bits
+      );
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    }); // DEBUG_KERNEL_BARRIER_ISOLATE
 
     table_unique_indices_offsets =
         fbgemm_gpu::asynchronous_complete_cumsum_gpu(dev_or_uvm_unique_indices).to(at::kInt);
@@ -940,31 +942,32 @@ Tensor {{ embedding_cuda_op }}(
     grad_output_mean = at::empty_like(grad_output_reshaped);
     {%- if not dense or not vbe %}
 
+    DEBUG_KERNEL_BARRIER_ISOLATE([&] {
 #ifdef FBGEMM_GPU_MEMCHECK
-    const auto func_name1 = "grad_mean{{ vdesc }}_kernel";
+      const auto func_name1 = "grad_mean{{ vdesc }}_kernel";
 #endif
+      grad_mean{{ vdesc }}_kernel<<<
+          div_round_up(total_B, kMaxThreads / kWarpSize),
+          dim3(kWarpSize, kMaxThreads / kWarpSize),
+          0,
+          at::cuda::getCurrentCUDAStream()>>>
+          (
+              MAKE_PTA_WITH_NAME(func_name1, grad_output_mean, grad_t, 2, 64),
+              MAKE_PTA_WITH_NAME(func_name1, grad_output_reshaped, grad_t, 2, 64),
+              MAKE_PTA_WITH_NAME(func_name1, D_offsets, int32_t, 1, 32),
+              MAKE_PTA_WITH_NAME(func_name1, offsets, index_t, 1, 32),
+              {%- if vbe %}
+              MAKE_PTA_WITH_NAME(func_name1, vbe_row_output_offsets, int64_t, 1, 32),
+              MAKE_PTA_WITH_NAME(func_name1, vbe_b_t_map, int32_t, 1, 32),
+              info_B_num_bits,
+              info_B_mask
+              {%- else %}
+              FixedDivisor(total_B / T)
+              {%- endif %}
+          );
 
-    grad_mean{{ vdesc }}_kernel<<<
-        div_round_up(total_B, kMaxThreads / kWarpSize),
-        dim3(kWarpSize, kMaxThreads / kWarpSize),
-        0,
-        at::cuda::getCurrentCUDAStream()>>>
-        (
-            MAKE_PTA_WITH_NAME(func_name1, grad_output_mean, grad_t, 2, 64),
-            MAKE_PTA_WITH_NAME(func_name1, grad_output_reshaped, grad_t, 2, 64),
-            MAKE_PTA_WITH_NAME(func_name1, D_offsets, int32_t, 1, 32),
-            MAKE_PTA_WITH_NAME(func_name1, offsets, index_t, 1, 32),
-            {%- if vbe %}
-            MAKE_PTA_WITH_NAME(func_name1, vbe_row_output_offsets, int64_t, 1, 32),
-            MAKE_PTA_WITH_NAME(func_name1, vbe_b_t_map, int32_t, 1, 32),
-            info_B_num_bits,
-            info_B_mask
-            {%- else %}
-            FixedDivisor(total_B / T)
-            {%- endif %}
-        );
-
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    }); // DEBUG_KERNEL_BARRIER_ISOLATE
     {%- endif %} // if not dense or not vbe
 
     grad_output_accessor = MAKE_PTA_WITH_NAME("{{ embedding_cuda_op }}.2", grad_output_mean, grad_t, 2, 64);
@@ -1005,27 +1008,29 @@ Tensor {{ embedding_cuda_op }}(
         use_deterministic_algorithms ? 0 : (indices.numel() / max_segment_length_per_cta),
         indices.options().dtype(at::kInt));
 
+    DEBUG_KERNEL_BARRIER_ISOLATE([&] {
 #ifdef FBGEMM_GPU_MEMCHECK
-    const auto func_name2 = "split_embedding_backward_codegen_find_long_segments";
+      const auto func_name2 = "split_embedding_backward_codegen_find_long_segments";
 #endif
 
-    split_embedding_backward_codegen_find_long_segments<<<
-        div_round_up(total_unique_indices, kMaxThreads),
-        kMaxThreads,
-        0,
-        at::cuda::getCurrentCUDAStream()
-    >>>(
-        MAKE_PTA_WITH_NAME(func_name2, sorted_linear_indices_num_runs, int32_t, 1, 32),
-        MAKE_PTA_WITH_NAME(func_name2, sorted_linear_indices_run_lengths, int32_t, 1, 32),
-        MAKE_PTA_WITH_NAME(func_name2, long_run_ids, int32_t, 1, 32),
-        MAKE_PTA_WITH_NAME(func_name2, num_long_run_ids, int32_t, 1, 32),
-        MAKE_PTA_WITH_NAME(func_name2, long_run_id_to_really_long_run_ids, int32_t, 1, 32),
-        MAKE_PTA_WITH_NAME(func_name2, num_really_long_run_ids, int32_t, 1, 32),
-        MAKE_PTA_WITH_NAME(func_name2, grad_accum_counter, int32_t, 1, 32),
-        max_segment_length_per_warp,
-        max_segment_length_per_cta,
-        use_deterministic_algorithms);
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
+      split_embedding_backward_codegen_find_long_segments<<<
+          div_round_up(total_unique_indices, kMaxThreads),
+          kMaxThreads,
+          0,
+          at::cuda::getCurrentCUDAStream()
+      >>>(
+          MAKE_PTA_WITH_NAME(func_name2, sorted_linear_indices_num_runs, int32_t, 1, 32),
+          MAKE_PTA_WITH_NAME(func_name2, sorted_linear_indices_run_lengths, int32_t, 1, 32),
+          MAKE_PTA_WITH_NAME(func_name2, long_run_ids, int32_t, 1, 32),
+          MAKE_PTA_WITH_NAME(func_name2, num_long_run_ids, int32_t, 1, 32),
+          MAKE_PTA_WITH_NAME(func_name2, long_run_id_to_really_long_run_ids, int32_t, 1, 32),
+          MAKE_PTA_WITH_NAME(func_name2, num_really_long_run_ids, int32_t, 1, 32),
+          MAKE_PTA_WITH_NAME(func_name2, grad_accum_counter, int32_t, 1, 32),
+          max_segment_length_per_warp,
+          max_segment_length_per_cta,
+          use_deterministic_algorithms);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    }); // DEBUG_KERNEL_BARRIER_ISOLATE
 
     // A temp buffer to accumulate gradients with atomics.
     auto temp_grad_accum = at::zeros(
@@ -1079,8 +1084,9 @@ Tensor {{ embedding_cuda_op }}(
         div_round_up(total_unique_indices, kMaxThreads),
         get_max_thread_blocks_());
 
+    DEBUG_KERNEL_BARRIER_ISOLATE([&] {
 #ifdef FBGEMM_GPU_MEMCHECK
-    const auto func_name3 = "{{ cta_kernel }}";
+      const auto func_name3 = "{{ cta_kernel }}";
 #endif
     backward_cta_per_row_kernel
         <<<cta_per_row_grid_size,
@@ -1161,6 +1167,8 @@ Tensor {{ embedding_cuda_op }}(
     );
 
     C10_CUDA_KERNEL_LAUNCH_CHECK();
+    }); // DEBUG_KERNEL_BARRIER_ISOLATE
+
     {%- set warp_kernel =
         "batch_index_select_dim0_codegen_backward_kernel_warp_per_row"
         if is_index_select else
@@ -1241,7 +1249,7 @@ Tensor {{ embedding_cuda_op }}(
     {%- endif %}
 #endif
 
-
+    DEBUG_KERNEL_BARRIER_ISOLATE([&] {
 #ifdef FBGEMM_GPU_MEMCHECK
     const auto func_name4 = "{{ warp_kernel }}";
 #endif
@@ -1316,6 +1324,8 @@ Tensor {{ embedding_cuda_op }}(
         {%- endif %}
     );
     C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+    }); // DEBUG_KERNEL_BARRIER_ISOLATE
     }); // DISPATCH_PLACEHOLDER_TYPES
     return;
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cuda.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// Kernel Barrier Isolation
+//
+// The kernel barrier isolation macro is a performance profiling tool that
+// isolates kernel execution from other GPU processes that might otherwise have
+// been running concurrently. This is used in conjunction with trace inspection
+// to determine whether a kernel's regression might be due to other GPU
+// processes competing for memory bandwidth that is causing the kernel slowdown,
+// which can be especially relevant when data accessed by the kernel is in UVM.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef FBGEMM_GPU_KERNEL_DEBUG
+
+#define DEBUG_KERNEL_BARRIER_ISOLATE(...) \
+  do {                                    \
+    cudaDeviceSynchronize();              \
+    __VA_ARGS__();                        \
+    cudaDeviceSynchronize();              \
+  } while (0);
+
+#else
+
+#define DEBUG_KERNEL_BARRIER_ISOLATE(...) \
+  do {                                    \
+    __VA_ARGS__();                        \
+  } while (0);
+
+#endif
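
As a usage note, DEBUG_KERNEL_BARRIER_ISOLATE takes a callable and invokes it between the two cudaDeviceSynchronize() calls only when FBGEMM_GPU_KERNEL_DEBUG is defined; otherwise it simply invokes the callable, so the barriers cost nothing in regular builds. A hedged sketch of a call site follows (the kernel and wrapper names are placeholders, not code from this diff):

// Hedged usage sketch of DEBUG_KERNEL_BARRIER_ISOLATE; copy_kernel and
// run_with_isolation are placeholders, not part of this commit.
#include <cuda_runtime.h>
#include "fbgemm_gpu/utils/barrier_isolation.cuh"

__global__ void copy_kernel(const float* in, float* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = in[i];
  }
}

void run_with_isolation(const float* d_in, float* d_out, int n, cudaStream_t stream) {
  DEBUG_KERNEL_BARRIER_ISOLATE([&] {
    // The body is an immediately invoked lambda, mirroring how the template
    // changes above wrap each kernel launch; because the macro is variadic,
    // commas inside the launch configuration are passed through unchanged.
    copy_kernel<<<(n + 255) / 256, 256, 0, stream>>>(d_in, d_out, n);
  });
}

Passing an immediately invoked lambda keeps the wrapped block, including any launch-check call, syntactically identical whether or not the debug flag is defined.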
