@@ -107,6 +107,7 @@ def execute_backward_optimizers_(  # noqa C901
         uvm_non_rowwise_momentum: bool = False,
         optimizer_state_dtypes: Optional[Dict[str, SparseType]] = None,
         use_rowwise_bias_correction: bool = False,
+        counter_weight_decay_mode: Optional[CounterWeightDecayMode] = None,
     ) -> None:
         # NOTE: limit (T * B * L * D) to avoid timeout for CPU version!
@@ -152,6 +153,11 @@ def execute_backward_optimizers_(  # noqa C901
             return
         if mixed_B and (use_cpu or pooling_mode == PoolingMode.NONE):
             return
+        if (
+            pooling_mode == PoolingMode.NONE
+            and counter_weight_decay_mode == CounterWeightDecayMode.ADAGRADW
+        ):
+            return
 
         emb_op = SplitTableBatchedEmbeddingBagsCodegen
         if pooling_mode == PoolingMode.SUM:
@@ -278,12 +284,22 @@ def execute_backward_optimizers_(  # noqa C901
             optimizer_kwargs["weight_decay_mode"] = weight_decay_mode
 
             if weight_decay_mode == WeightDecayMode.COUNTER:
+                opt_arg_weight_decay_mode: CounterWeightDecayMode = (
+                    counter_weight_decay_mode
+                    if counter_weight_decay_mode is not None
+                    else CounterWeightDecayMode.DECOUPLE
+                )
+                opt_arg_learning_rate_mode: LearningRateMode = (
+                    LearningRateMode.TAIL_ID_LR_DECREASE
+                    if opt_arg_weight_decay_mode != CounterWeightDecayMode.ADAGRADW
+                    else LearningRateMode.EQUAL
+                )
                 counter_based_regularization = CounterBasedRegularizationDefinition(
-                    counter_weight_decay_mode=CounterWeightDecayMode.DECOUPLE,
+                    counter_weight_decay_mode=opt_arg_weight_decay_mode,
                     counter_halflife=20000,
-                    adjustment_iter=24000,
+                    adjustment_iter=-1,
                     adjustment_ub=0.1,
-                    learning_rate_mode=LearningRateMode.TAIL_ID_LR_DECREASE,
+                    learning_rate_mode=opt_arg_learning_rate_mode,
                     grad_sum_decay=GradSumDecay.NO_DECAY,
                     tail_id_threshold=TailIdThreshold(val=1000, is_ratio=False),
                 )
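
For readers skimming the hunk above: the new opt_arg_* selection pairs ADAGRADW with LearningRateMode.EQUAL, keeps TAIL_ID_LR_DECREASE for every other counter mode, and defaults an unset mode to DECOUPLE. A minimal, self-contained sketch of that selection logic (the Enum classes below are local stand-ins with placeholder values, not the FBGEMM definitions):

    from enum import Enum, auto
    from typing import Optional, Tuple


    class CounterWeightDecayMode(Enum):
        # Local stand-in for the FBGEMM enum; member values are placeholders.
        NONE = auto()
        L2 = auto()
        DECOUPLE = auto()
        ADAGRADW = auto()


    class LearningRateMode(Enum):
        # Local stand-in for the FBGEMM enum; member values are placeholders.
        EQUAL = auto()
        TAIL_ID_LR_DECREASE = auto()


    def select_counter_args(
        mode: Optional[CounterWeightDecayMode],
    ) -> Tuple[CounterWeightDecayMode, LearningRateMode]:
        # An unset mode falls back to DECOUPLE, the previously hard-coded value.
        wd_mode = mode if mode is not None else CounterWeightDecayMode.DECOUPLE
        # ADAGRADW is paired with EQUAL; all other modes keep TAIL_ID_LR_DECREASE.
        lr_mode = (
            LearningRateMode.EQUAL
            if wd_mode == CounterWeightDecayMode.ADAGRADW
            else LearningRateMode.TAIL_ID_LR_DECREASE
        )
        return wd_mode, lr_mode


    assert select_counter_args(None) == (
        CounterWeightDecayMode.DECOUPLE,
        LearningRateMode.TAIL_ID_LR_DECREASE,
    )
    assert select_counter_args(CounterWeightDecayMode.ADAGRADW) == (
        CounterWeightDecayMode.ADAGRADW,
        LearningRateMode.EQUAL,
    )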
@@ -545,6 +561,12 @@ def execute_backward_optimizers_(  # noqa C901
                 WeightDecayMode.COWCLIP,
             ):
                 expected_keys.update(["prev_iter", "row_counter"])
+                if (
+                    weight_decay_mode == WeightDecayMode.COUNTER
+                    and counter_based_regularization.counter_weight_decay_mode
+                    == CounterWeightDecayMode.ADAGRADW
+                ):
+                    expected_keys.update(["iter"])
             assert set(optimizer_states_dict.keys()) == expected_keys
 
         if optimizer in (OptimType.PARTIAL_ROWWISE_ADAM, OptimType.ADAM):
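
The hunk above encodes the optimizer-state expectation that only the ADAGRADW counter mode adds an "iter" state on top of the "prev_iter" and "row_counter" states shared by COUNTER and COWCLIP runs. A tiny restatement of that expectation (the boolean flag is a hypothetical stand-in for the enum comparison):

    expected_keys = {"prev_iter", "row_counter"}
    is_adagradw = True  # stand-in for counter_weight_decay_mode == ADAGRADW
    if is_adagradw:
        expected_keys.update(["iter"])
    assert expected_keys == {"prev_iter", "row_counter", "iter"}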
@@ -778,12 +800,13 @@ def _get_grad_from_counter_adagrad(
    l2_wd = 1.0 if counter_weight_decay_mode == CounterWeightDecayMode.L2 else 0.0
 
    if counter_halflife > 0:
-        freq = torch.tensor([counter_halflife]) / row_counter
+        freq = torch.where(
+            row_counter > 0,
+            torch.tensor([counter_halflife]) / row_counter,
+            torch.tensor([1.0]),
+        )
 
-    if isinstance(regularization, CounterBasedRegularizationDefinition):
-        dense_cpu_grad += l2_wd * freq * weight_decay * weights
-    else:
-        dense_cpu_grad += l2_wd * weight_decay * weights
+    dense_cpu_grad += l2_wd * weight_decay * weights
 
    return dense_cpu_grad, row_counter, freq
 
 def _get_wts_from_counter_adagrad_using_counter(
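
The torch.where guard introduced above keeps the reference computation from dividing by zero for rows whose counter is still zero; such rows fall back to a frequency of 1.0. A quick standalone check, with illustrative counter values:

    import torch

    counter_halflife = 20000
    # Hypothetical per-row counters; the middle row has never been updated.
    row_counter = torch.tensor([4000.0, 0.0, 20000.0])
    freq = torch.where(
        row_counter > 0,
        torch.tensor([counter_halflife]) / row_counter,
        torch.tensor([1.0]),
    )
    # Positive counts yield halflife / counter; zero counts fall back to 1.0.
    print(freq)  # tensor([5., 1., 1.])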
@@ -863,6 +886,11 @@ def _get_wts_from_counter_adagrad_using_counter(
         exp_reg_correction = 1.0 - freq * weight_decay * learning_rate
     elif counter_weight_decay_mode == CounterWeightDecayMode.L2:
         exp_reg_correction = 1.0 - freq * weight_decay * multiplier
+    elif counter_weight_decay_mode == CounterWeightDecayMode.ADAGRADW:
+        adjusted_multiplier = multiplier * torch.sqrt(row_counter)
+        exp_reg_correction = torch.where(
+            row_counter > 0, 1.0 - weight_decay * learning_rate, 1.0
+        )
 
    weights = exp_reg_correction * weights - adjusted_multiplier * dense_cpu_grad
    return weights
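
The new ADAGRADW branch scales the Adagrad multiplier by the square root of the per-row counter and applies decoupled weight decay only to rows that have actually been seen. A minimal numeric illustration of that update (all tensor values hypothetical):

    import torch

    learning_rate, weight_decay = 0.01, 0.1
    multiplier = torch.tensor([[0.5], [0.5]])
    row_counter = torch.tensor([[4.0], [0.0]])  # second row never seen

    # Gradient step is amplified by sqrt of the row count (zero for unseen rows).
    adjusted_multiplier = multiplier * torch.sqrt(row_counter)
    # Decoupled weight decay is skipped entirely for unseen rows.
    exp_reg_correction = torch.where(
        row_counter > 0, 1.0 - weight_decay * learning_rate, 1.0
    )

    weights = torch.ones(2, 1)
    dense_cpu_grad = torch.full((2, 1), 0.2)
    weights = exp_reg_correction * weights - adjusted_multiplier * dense_cpu_grad
    print(weights)  # tensor([[0.7990], [1.0000]]): unseen row is left untouched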
@@ -1129,6 +1157,14 @@ def test_backward_optimizers_partial_rowwise_adam_bf16_momentum(  # noqa C901
                 WeightDecayMode.COWCLIP,
             ]
         ),
+        counter_weight_decay_mode=st.sampled_from(
+            [
+                CounterWeightDecayMode.NONE,
+                CounterWeightDecayMode.L2,
+                CounterWeightDecayMode.DECOUPLE,
+                CounterWeightDecayMode.ADAGRADW,
+            ]
+        ),
     )
     @settings(
         verbosity=VERBOSITY,
@@ -1152,6 +1188,7 @@ def test_backward_optimizers_adagrad(  # noqa C901
         pooling_mode: PoolingMode,
         use_cpu: bool,
         weight_decay_mode: WeightDecayMode,
+        counter_weight_decay_mode: CounterWeightDecayMode,
     ) -> None:
         if (
             pooling_mode == PoolingMode.NONE
@@ -1172,6 +1209,7 @@ def test_backward_optimizers_adagrad(  # noqa C901
             pooling_mode,
             use_cpu,
             weight_decay_mode,
+            counter_weight_decay_mode=counter_weight_decay_mode,
         )
 
     @given(