
Commit 81cb119

spcyppt authored and facebook-github-bot committed
Compute info_B_num_bits from T to make it a constant (#3748)
Summary: X-link: facebookresearch/FBGEMM#829

`b_t_map` packs batch (`b`) and feature (`t`) information. `info_B_num_bits` says how many bits are used to cover the batch information, and it is currently recomputed on each iteration from the batch size. This calculation is problematic when `max_B_` is symbolic, which causes issues in eager AOT mode: when `max_B_` is symbolic, `info_B_num_bits` is not recomputed and falls back to the default value, which can fail if there are not enough bits for B.

To resolve this, we make `info_B_num_bits` a constant. The current implementation adjusts `info_B_num_bits` based on the batch size, so it changes every iteration, and simply freezing the value could again leave insufficient bits for B. This diff therefore implements `get_info_B_num_bits_from_T`: we first calculate how many bits are required to cover the `T` information, since the number of features is known at TBE initialization and remains the same throughout the run, and the remaining bits are given to `B`. Because `info_T_num_bits` never changes, `info_B_num_bits` never changes either. If there are not enough bits for B, the check fails loudly.

In the V1 interface we keep the signature unchanged, since we have already hit the limit on the maximum number of arguments. In the V2 interface (next diff), we compute `info_B_num_bits` and `info_B_mask` once, store them as module parameters, and pass them to the lookup and the corresponding Autograd and backend functions.

Reviewed By: sryap

Differential Revision: D69387123
1 parent 69879df commit 81cb119
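To make the bit split concrete, here is a minimal Python sketch of the scheme described in the summary. It assumes the packed info word is 32 bits wide (`DEFAULT_INFO_NUM_BITS` in the source); the Python function name and the printed example are illustrative only.

```python
import math

# Assumed total width of a packed b_t_map entry (DEFAULT_INFO_NUM_BITS).
DEFAULT_INFO_NUM_BITS = 32

def info_b_num_bits_from_t(T: int, B: int = 1) -> tuple[int, int]:
    """Illustrative Python mirror of get_info_B_num_bits_from_T."""
    # Bits needed for T, fixed at TBE initialization; the rest go to B.
    info_T_num_bits = math.floor(math.log2(T)) + 1
    info_B_num_bits = DEFAULT_INFO_NUM_BITS - info_T_num_bits
    info_B_mask = (1 << info_B_num_bits) - 1
    # Fail loudly instead of silently truncating when B does not fit.
    if B > info_B_mask:
        raise ValueError(f"not enough info bits for B={B}")
    return info_B_num_bits, info_B_mask

# T=100 features take 7 bits, leaving 25 bits for B (max B = 33,554,431).
print(info_b_num_bits_from_t(100, 4096))  # -> (25, 33554431)
```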

8 files changed: +87 −17 lines

fbgemm_gpu/codegen/genscript/generate_backward_split.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -413,6 +413,8 @@ def generate() -> None:
             ],
             "aux_int": [
                 "iter",  # 0
+                "info_B_num_bits",  # 1
+                "info_B_mask",  # 2
             ],
             "aux_float": [
                 "gwd_lower_bound",  # 0
```

fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp

Lines changed: 2 additions & 13 deletions

```diff
@@ -685,19 +685,8 @@ class {{ autograd_func }} :
     // Default values for Dynamo tracing
     // SymInt does not support bitshifts operator
     // Constanting info_B_num_bits, info_B_mask for Dynamo for now.
-    int32_t info_B_num_bits = DEFAULT_INFO_B_NUM_BITS;
-    uint32_t info_B_mask = (1u << info_B_num_bits) - 1;
-    if (max_B_.is_symbolic()) {
-      // int32_t info_B_num_bits = 22;
-      // uint32_t info_B_mask = (1u << info_B_num_bits) - 1;
-
-      // TODO(ivankobzarev): Guarding Dynamo that T and B fits in constanted number of bits.
-      // TORCH_CHECK(max_B_ < 1u << info_B_num_bits)
-      // TORCH_CHECK(T < 1u << (DEFAULT_INFO_NUM_BITS - info_B_num_bits))
-    } else {
-      // TODO: don't guard here
-      std::tie(info_B_num_bits, info_B_mask) = adjust_info_B_num_bits(max_B_.guard_int(__FILE__, __LINE__), T.guard_int(__FILE__, __LINE__));
-    }
+    const auto info_B_num_bits = static_cast<int32_t>(aux_int[IDX_INFO_B_NUM_BITS]);
+    const auto info_B_mask = static_cast<uint32_t>(aux_int[IDX_INFO_B_MASK]);

     {%- if vbe %}
     static auto generate_vbe_metadata_op =
```
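With `info_B_num_bits` and `info_B_mask` now read from `aux_int` as constants, the packing of `b_t_map` entries stays stable across iterations. A hypothetical sketch of that packing (not FBGEMM's kernel code):

```python
info_B_num_bits = 25
info_B_mask = (1 << info_B_num_bits) - 1

def pack(t: int, b: int) -> int:
    # Feature id in the high bits, batch index in the low bits.
    return (t << info_B_num_bits) | b

def unpack(b_t: int) -> tuple[int, int]:
    return b_t >> info_B_num_bits, b_t & info_B_mask

assert unpack(pack(t=3, b=4095)) == (3, 4095)
```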

fbgemm_gpu/codegen/training/python/lookup_args.template

Lines changed: 30 additions & 0 deletions

```diff
@@ -77,6 +77,36 @@ class OptimizerArgs(NamedTuple):
     regularization_mode: int
     use_rowwise_bias_correction: bool  # Used for OptimType.ADAM

+class CommonArgsPT2(NamedTuple):
+    placeholder_autograd_tensor: torch.Tensor
+    dev_weights: torch.Tensor
+    host_weights: torch.Tensor
+    uvm_weights: torch.Tensor
+    lxu_cache_weights: torch.Tensor
+    weights_placements: torch.Tensor
+    weights_offsets: torch.Tensor
+    D_offsets: torch.Tensor
+    total_D: int
+    max_D: int
+    hash_size_cumsum: torch.Tensor
+    total_hash_size_bits: int
+    indices: torch.Tensor
+    offsets: torch.Tensor
+    pooling_mode: int
+    indice_weights: Optional[torch.Tensor]
+    feature_requires_grad: Optional[torch.Tensor]
+    lxu_cache_locations: torch.Tensor
+    uvm_cache_stats: Optional[torch.Tensor]
+    output_dtype: int
+    vbe_metadata: VBEMetadata
+    is_experimental: bool
+    use_uniq_cache_locations_bwd: bool
+    use_homogeneous_placements: bool
+    info_B_num_bits: int
+    info_B_mask: int
+{%- if ssd %}
+    ssd_tensors: Dict[str, torch.Tensor]
+{%- endif %}

 class OptimizerArgsPT2(NamedTuple):
     """
```

fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.h

Lines changed: 1 addition & 0 deletions

```diff
@@ -22,6 +22,7 @@ std::tuple<int64_t, int64_t>
 get_infos_metadata(at::Tensor unused, int64_t B, int64_t T);

 std::tuple<int32_t, uint32_t> adjust_info_B_num_bits(int32_t B, int32_t T);
+std::tuple<int32_t, uint32_t> get_info_B_num_bits_from_T(int32_t T, int32_t B);

 std::tuple<at::Tensor /*row_output_offsets*/, at::Tensor /*b_t_map*/>
 generate_vbe_metadata(
```
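Note the argument orders: the existing `adjust_info_B_num_bits` takes `(B, T)`, while the new `get_info_B_num_bits_from_T` takes `(T, B)`; the call sites below are updated accordingly.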

fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu

Lines changed: 1 addition & 1 deletion

```diff
@@ -186,7 +186,7 @@ DLL_PUBLIC Tensor batched_unary_embeddings_backward_cuda(

   int32_t info_B_num_bits;
   uint32_t info_B_mask;
-  std::tie(info_B_num_bits, info_B_mask) = adjust_info_B_num_bits(B, T);
+  std::tie(info_B_num_bits, info_B_mask) = get_info_B_num_bits_from_T(T, B);

   // weight: [N, sum_E]
   // total_hash_size_bits = log2(sum_E)
```

fbgemm_gpu/src/split_embeddings_utils/get_infos_metadata.cu

Lines changed: 1 addition & 1 deletion

```diff
@@ -15,5 +15,5 @@ using namespace fbgemm_gpu;

 DLL_PUBLIC std::tuple<int64_t, int64_t>
 get_infos_metadata(Tensor unused, int64_t B, int64_t T) {
-  return adjust_info_B_num_bits(B, T);
+  return get_info_B_num_bits_from_T(T, B);
 }
```

fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils_cpu.cpp

Lines changed: 48 additions & 1 deletion

```diff
@@ -13,6 +13,53 @@

 using Tensor = at::Tensor;

+/// Find the number of bits needed to accommodate a value.
+///
+/// Returns the number of bits required to represent `n`,
+/// e.g., the function returns 3 if `n` is between 4 (100)
+/// and 7 (111), as 3 bits are required to represent those
+/// numbers.
+///
+/// @param n positive decimal number
+///
+DLL_PUBLIC int32_t get_num_bits(int32_t n) {
+  TORCH_CHECK(n > 0, "Expect n to be positive but got ", n);
+  return static_cast<int32_t>(std::floor(std::log2(n) + 1));
+}
+
+/// Calculates the number of bits needed to accommodate the batch size (B)
+/// and the number of tables (T), starting from T. We first calculate how
+/// many bits T needs; the rest are for B, since T does not change once the
+/// TBE is initialized but B can.
+///
+/// info_B_num_bits Number of bits needed to accommodate the batch size
+/// info_B_mask     Bit mask for the B information
+/// @param T Number of tables (features)
+/// @param B Batch size
+///
+DLL_PUBLIC std::tuple<int32_t, uint32_t> get_info_B_num_bits_from_T(
+    int32_t T,
+    int32_t B = 1) {
+  TORCH_CHECK(B > 0, "B must be positive. Got B = ", B);
+  TORCH_CHECK(T > 0, "T must be positive. Got T = ", T);
+  const int32_t info_T_num_bits = get_num_bits(T);
+  const int32_t info_B_num_bits = DEFAULT_INFO_NUM_BITS - info_T_num_bits;
+  const uint32_t info_B_mask = (1u << info_B_num_bits) - 1;
+  TORCH_CHECK(
+      B <= info_B_mask,
+      "Not enough info bits to accommodate T and B. T = ",
+      T,
+      " takes ",
+      info_T_num_bits,
+      " bits and info_B_num_bits is ",
+      info_B_num_bits,
+      ". Expected max_B = ",
+      info_B_mask,
+      " but got B = ",
+      B);
+
+  return {info_B_num_bits, info_B_mask};
+}
+
 DLL_PUBLIC std::tuple<int32_t, uint32_t> adjust_info_B_num_bits(
     int32_t B,
     int32_t T) {
@@ -79,7 +126,7 @@ generate_vbe_metadata_cpu(

 std::tuple<int64_t, int64_t>
 get_infos_metadata_cpu(Tensor unused, int64_t B, int64_t T) {
-  return adjust_info_B_num_bits(B, T);
+  return get_info_B_num_bits_from_T(T, B);
 }

 } // namespace
```
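A quick sanity check of the `get_num_bits` math against Python's integer `bit_length`, which computes the same quantity without floating point (an illustrative equivalence check, not project code):

```python
import math

def get_num_bits(n: int) -> int:
    # Mirrors the C++ floor(log2(n) + 1) above.
    assert n > 0
    return math.floor(math.log2(n) + 1)

# bit_length() gives the identical answer, including at powers of 2.
for n in (1, 2, 3, 4, 7, 8, 100, 1023, 1024):
    assert get_num_bits(n) == n.bit_length()
```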

fbgemm_gpu/test/tbe/utils/split_embeddings_utils_test.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -190,7 +190,8 @@ def test_transpose(self, B: int, T: int, E: int) -> None:
         self.assertTrue(
             torch.equal(linear_indices_sorted.cpu(), linear_indices_sorted_ref)
         )
-        self.assertTrue(torch.equal(infos_sorted.cpu(), infos_sorted_ref))
+        infos_sorted = infos_sorted.cpu()
+        self.assertTrue(torch.equal(infos_sorted, infos_sorted_ref.to(torch.int32)))

         # fbgemm impl has padding so we need slice
         num = sorted_linear_indices_run_ref.numel()
```
