Add logic for fqn_to_feature_names #3059


Closed
wants to merge 1 commit into from
80 changes: 72 additions & 8 deletions torchrec/distributed/model_tracker/model_delta_tracker.py
@@ -6,7 +6,9 @@
# LICENSE file in the root directory of this source tree.

# pyre-strict
from typing import Dict, List, Optional, Union
import logging as logger
from collections import Counter, OrderedDict
from typing import Dict, Iterable, List, Optional, Union

import torch

@@ -30,7 +32,7 @@
}

# Tracking is currently only supported for ShardedEmbeddingCollection and ShardedEmbeddingBagCollection.
SUPPORTED_MODULES = Union[ShardedEmbeddingCollection, ShardedEmbeddingBagCollection]
SUPPORTED_MODULES = (ShardedEmbeddingCollection, ShardedEmbeddingBagCollection)
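For context, a typing.Union alias generally cannot be used with isinstance(), which is why the constant becomes a plain tuple of classes; a minimal sketch with placeholder classes A and B standing in for the real sharded modules:

class A: ...
class B: ...

SUPPORTED = (A, B)  # a tuple of classes is a valid second argument to isinstance()
assert isinstance(A(), SUPPORTED)
# isinstance(A(), Union[A, B]) would raise TypeError on most Python versions.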


class ModelDeltaTracker:
@@ -49,6 +51,8 @@ class ModelDeltaTracker:
call.
delete_on_read (bool, optional): whether to delete the tracked ids after all consumers have read them.
mode (TrackingMode, optional): tracking mode to use from supported tracking modes. Default: TrackingMode.ID_ONLY.
fqns_to_skip (Iterable[str], optional): iterable of FQNs to skip from tracking. Default: ().

"""

DEFAULT_CONSUMER: str = "default"
@@ -59,11 +63,15 @@ def __init__(
consumers: Optional[List[str]] = None,
delete_on_read: bool = True,
mode: TrackingMode = TrackingMode.ID_ONLY,
fqns_to_skip: Iterable[str] = (),
) -> None:
self._model = model
self._consumers: List[str] = consumers or [self.DEFAULT_CONSUMER]
self._delete_on_read = delete_on_read
self._mode = mode
self._fqn_to_feature_map: Dict[str, List[str]] = {}
self._fqns_to_skip: Iterable[str] = fqns_to_skip
self.fqn_to_feature_names()
pass
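A rough usage sketch of the updated constructor; `sharded_model` and the skipped FQN fragment are placeholders, assuming the model contains sharded EmbeddingCollection/EmbeddingBagCollection modules:

# Hypothetical usage of the new fqns_to_skip argument.
tracker = ModelDeltaTracker(
    model=sharded_model,                   # assumed DMP-wrapped sharded model
    consumers=["downstream_consumer"],     # optional; defaults to ["default"]
    delete_on_read=True,
    mode=TrackingMode.ID_ONLY,
    fqns_to_skip=["sparse_arch_to_skip"],  # modules whose FQN contains this path component are skipped
)
# The FQN-to-feature-names map is built eagerly in __init__ and cached.
feature_map = tracker.fqn_to_feature_names()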

def record_lookup(self, kjt: KeyedJaggedTensor, states: torch.Tensor) -> None:
@@ -85,14 +93,70 @@ def get_delta(self, consumer: Optional[str] = None) -> Dict[str, DeltaRows]:
"""
return {}

def fqn_to_feature_names(self, module: nn.Module) -> Dict[str, List[str]]:
def fqn_to_feature_names(self) -> Dict[str, List[str]]:
"""
Returns a mapping from FQN to feature names for a given module.

Args:
module (nn.Module): the module to retrieve feature names for.
Returns a mapping of FQN to feature names for all supported modules (EmbeddingCollection and EmbeddingBagCollection) present in the given model.
"""
return {}
if (self._fqn_to_feature_map is not None) and len(self._fqn_to_feature_map) > 0:
return self._fqn_to_feature_map

table_to_feature_names: Dict[str, List[str]] = OrderedDict()
table_to_fqn: Dict[str, str] = OrderedDict()
for fqn, named_module in self._model.named_modules():
split_fqn = fqn.split(".")
# Skipping partial FQNs present in fqns_to_skip
# TODO: Validate if we need to support more complex patterns for skipping fqns
should_skip = False
for fqn_to_skip in self._fqns_to_skip:
if fqn_to_skip in split_fqn:
logger.info(f"Skipping {fqn} because it is part of fqns_to_skip")
should_skip = True
break
if should_skip:
continue

# Using the FQNs of the embedding modules and mapping them to features, as the state_dict() API uses these FQNs to key states.
if isinstance(named_module, SUPPORTED_MODULES):
for table_name, config in named_module._table_name_to_config.items():
logger.info(
f"Found {table_name} for {fqn} with features {config.feature_names}"
)
table_to_feature_names[table_name] = config.feature_names
for table_name in table_to_feature_names:
# Using the split FQN to match the exact table name. Otherwise, checking "table_name in fqn"
# would incorrectly match the fqn against every table name that shares the same prefix.
if table_name in split_fqn:
embedding_fqn = fqn.replace("_dmp_wrapped_module.module.", "")
if table_name in table_to_fqn:
# Sanity check to validate that we don't have more than one table mapping to the same fqn.
logger.warning(
f"Override {table_to_fqn[table_name]} with {embedding_fqn} for entry {table_name}"
)
table_to_fqn[table_name] = embedding_fqn
logger.info(f"Table to fqn: {table_to_fqn}")
flatten_names = [
name for names in table_to_feature_names.values() for name in names
]
# TODO: Validate if there is a better way to handle duplicate feature names.
# Logging a warning if duplicate feature names are found across tables, but continuing execution, as this could be a valid case.
if len(set(flatten_names)) != len(flatten_names):
counts = Counter(flatten_names)
duplicates = [item for item, count in counts.items() if count > 1]
logger.warning(f"duplicate feature names found: {duplicates}")

fqn_to_feature_names: Dict[str, List[str]] = OrderedDict()
for table_name in table_to_feature_names:
if table_name not in table_to_fqn:
# This is unexpected: the FQN associated with this table could not be located.
logger.warning(
f"Table {table_name} not found in {table_to_fqn}, skipping"
)
continue
fqn_to_feature_names[table_to_fqn[table_name]] = table_to_feature_names[
table_name
]
self._fqn_to_feature_map = fqn_to_feature_names
return fqn_to_feature_names
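To make the shape of the cached result concrete, a hypothetical return value (FQNs and feature names invented for illustration):

# Keys are per-table embedding FQNs with the "_dmp_wrapped_module.module."
# prefix stripped; values are that table's feature names.
expected = {
    "sparse_arch.ebc.embedding_bags.t_user": ["user_id"],
    "sparse_arch.ebc.embedding_bags.t_item": ["item_id", "item_category"],
}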

def clear(self, consumer: Optional[str] = None) -> None:
"""