Skip to content

Commit 631bb8d

Browse files
jianyuhfacebook-github-bot
authored andcommitted
Add more comments for infer TBE kernel for vendor optimizations (#948)
Summary: Pull Request resolved: #948 As title: just adding more comments in infer TBE kernel Reviewed By: jspark1105 Differential Revision: D34430782 fbshipit-source-id: 1601f9c6232eff4bea9f4f517cb888656e6d30a3
1 parent c3a26e1 commit 631bb8d

File tree

1 file changed

+7
-0
lines changed

1 file changed

+7
-0
lines changed

fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,9 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i
322322
continue;
323323
}
324324
const uint32_t* row = reinterpret_cast<const uint32_t*>(&buffers[warp_idx][i][input_row_idx][0]);
325+
// scale and bias are at the beginning of each row.
326+
// rationale: have scale/shift at start since these get loaded first
327+
// and then broadcasted around so it might speed up the first cache miss.
325328
{% if bit_width in [8, 4, 2] %}
326329
half2 shift_scale = reinterpret_cast<const half2*>(row)[0];
327330
{% endif %}
@@ -347,6 +350,10 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i
347350
if (std::is_same<output_t, float>::value || std::is_same<output_t, at::Half>::value) {
348351
#pragma unroll MaxNum128BRows
349352
for (uint32_t j = 0; j < MaxNum128BRows; ++j) {
353+
// Read the uint8/4/2 values: note that first 4 Bytes will be ditched later:
354+
// We shift back by 4/8/16 elements to remove the first 4 Bytes (which is garbage due to
355+
// the scale/shift handling).
356+
// Reason: to avoid divergence the first thread in the warp computes garbage.
350357
int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
351358
scalar_t v = reinterpret_cast<const scalar_t*>(row)[kWarpSize * j + threadIdx.x];
352359
if (output_d >= 0 && output_d < D) {

0 commit comments

Comments
 (0)