
Commit 0b110fa

aliafzal authored and facebook-github-bot committed
Add logic for fqn_to_feature_names (#3059)
Summary:

This Diff

Added the implementation of the fqn_to_feature_names method, along with an initial testing framework and unit tests for fqn_to_feature_names.

ModelDeltaTracker Context

ModelDeltaTracker is a utility for tracking and retrieving unique IDs and their corresponding embeddings or states from embedding modules in a model built with TorchRec. It is particularly useful for:

1. Identifying which embedding rows were accessed during model execution
2. Retrieving the latest delta or unique rows for a model
3. Computing top-k changed embeddings
4. Supporting streaming of updated embeddings between systems during online training

Differential Revision: D75908963
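A minimal usage sketch (not from this diff; `model` and the skipped table name are stand-ins, and in this diff only fqn_to_feature_names() is implemented while the other tracker methods remain stubs):

    from torchrec.distributed.model_tracker.model_delta_tracker import ModelDeltaTracker

    # Assumption: `model` is a DistributedModelParallel-wrapped TorchRec model.
    tracker = ModelDeltaTracker(model, fqns_to_skip=["table_to_ignore"])
    # Maps each embedding-module FQN to the feature names it serves,
    # e.g. {"ec.embeddings.sparse_table_1": ["f1", "f2", "f3"]}.
    print(tracker.fqn_to_feature_names())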
1 parent a401ef1 commit 0b110fa

File tree

3 files changed: +570 −3 lines changed

torchrec/distributed/model_tracker/model_delta_tracker.py

Lines changed: 66 additions & 3 deletions
@@ -6,7 +6,9 @@
 # LICENSE file in the root directory of this source tree.

 # pyre-strict
-from typing import Dict, List, Optional, Union
+import logging as logger
+from collections import Counter, OrderedDict
+from typing import Dict, Iterable, List, Optional, Union

 import torch

@@ -59,11 +61,15 @@ def __init__(
         consumers: Optional[List[str]] = None,
         delete_on_read: bool = True,
         mode: TrackingMode = TrackingMode.ID_ONLY,
+        fqns_to_skip: Iterable[str] = (),
     ) -> None:
         self._model = model
         self._consumers: List[str] = consumers or [self.DEFAULT_CONSUMER]
         self._delete_on_read = delete_on_read
         self._mode = mode
+        self._fqn_to_feature_map: Dict[str, List[str]] = {}
+        self._fqns_to_skip: Iterable[str] = fqns_to_skip
+        self.fqn_to_feature_names()
         pass

     def record_lookup(self, kjt: KeyedJaggedTensor, states: torch.Tensor) -> None:
@@ -85,14 +91,71 @@ def get_delta(self, consumer: Optional[str] = None) -> Dict[str, DeltaRows]:
         """
         return {}

-    def fqn_to_feature_names(self, module: nn.Module) -> Dict[str, List[str]]:
+    def fqn_to_feature_names(self) -> Dict[str, List[str]]:
         """
-        Returns a mapping from FQN to feature names for a given module.
-
-        Args:
-            module (nn.Module): the module to retrieve feature names for.
+        Returns a mapping from FQN to feature names for the tracked model's
+        embedding modules.
         """
-        return {}
+        if (self._fqn_to_feature_map is not None) and len(self._fqn_to_feature_map) > 0:
+            return self._fqn_to_feature_map
+
+        table_to_feature_names: Dict[str, List[str]] = OrderedDict()
+        table_to_fqn: Dict[str, str] = OrderedDict()
+        for fqn, named_module in self._model.named_modules():
+            split_fqn = fqn.split(".")
+
+            should_skip = False
+            for fqn_to_skip in self._fqns_to_skip:
+                if fqn_to_skip in split_fqn:
+                    logger.info(f"Skipping {fqn} because it is part of fqns_to_skip")
+                    should_skip = True
+                    break
+            if should_skip:
+                continue
+
+            # Use the embedding modules' FQNs to key features, since the
+            # state_dict() API keys states by these FQNs.
+            if isinstance(named_module, SUPPORTED_MODULES):
+                for table_name, config in named_module._table_name_to_config.items():
+                    logger.info(
+                        f"Found {table_name} for {fqn} with features {config.feature_names}"
+                    )
+                    table_to_feature_names[table_name] = config.feature_names
+            for table_name in table_to_feature_names:
+                # Use the split FQN for an exact table-name match. Checking
+                # "table_name in fqn" would incorrectly match every table name
+                # that shares a prefix with the FQN.
+                if table_name in split_fqn:
+                    embedding_fqn = fqn.replace("_dmp_wrapped_module.module.", "")
+                    if table_name in table_to_fqn:
+                        # Sanity check: no more than one table should map to the same FQN.
+                        logger.warning(
+                            f"Override {table_to_fqn[table_name]} with {embedding_fqn} for entry {table_name}"
+                        )
+                    table_to_fqn[table_name] = embedding_fqn
+        logger.info(f"Table to fqn: {table_to_fqn}")
+        flatten_names = [
+            name for names in table_to_feature_names.values() for name in names
+        ]
+        # Some ads models have duplicate feature names, so the uniqueness check
+        # is relaxed to a warning when duplicates appear.
+        if len(set(flatten_names)) != len(flatten_names):
+            counts = Counter(flatten_names)
+            duplicates = [item for item, count in counts.items() if count > 1]
+            logger.warning(f"duplicate feature names found: {duplicates}")
+
+        fqn_to_feature_names: Dict[str, List[str]] = OrderedDict()
+        for table_name in table_to_feature_names:
+            if table_name not in table_to_fqn:
+                # Unexpected: the FQN associated with this table could not be located.
+                logger.warning(
+                    f"Table {table_name} not found in {table_to_fqn}, skipping"
+                )
+                continue
+            fqn_to_feature_names[table_to_fqn[table_name]] = table_to_feature_names[
+                table_name
+            ]
+        self._fqn_to_feature_map = fqn_to_feature_names
+        return fqn_to_feature_names

     def clear(self, consumer: Optional[str] = None) -> None:
         """
Lines changed: 282 additions & 0 deletions
@@ -0,0 +1,282 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict
import os
from dataclasses import dataclass
from typing import cast, Dict, List

import torch
import torchrec
from fbgemm_gpu.split_embedding_configs import EmbOptimType as OptimType

from parameterized import parameterized
from torch import distributed as dist, nn
from torch.testing._internal.common_distributed import MultiProcessTestCase
from torchrec.distributed import DistributedModelParallel
from torchrec.distributed.embedding import EmbeddingCollectionSharder
from torchrec.distributed.embedding_types import ModuleSharder, ShardingType
from torchrec.distributed.embeddingbag import EmbeddingBagCollectionSharder
from torchrec.distributed.model_tracker.model_delta_tracker import ModelDeltaTracker
from torchrec.distributed.model_tracker.tests.utils import (
    EmbeddingTableProps,
    generate_planner_constraints,
    TestEBCModel,
    TestECModel,
)

from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology
from torchrec.modules.embedding_configs import (
    EmbeddingBagConfig,
    EmbeddingConfig,
    PoolingType,
)

NUM_EMBEDDINGS: int = 16
EMBEDDING_DIM: int = 256


class ModelDeltaTrackerTest(MultiProcessTestCase):
    # pyre-fixme[2]: Parameter must be annotated.
    def __init__(self, methodName="runTest") -> None:
        super().__init__(methodName)

    @property
    def world_size(self) -> int:
        return 2

    def setUp(self) -> None:
        super().setUp()
        self._spawn_processes()

    def tearDown(self) -> None:
        super().tearDown()
        try:
            os.remove(self.file_name)
        except OSError:
            pass

    def _get_store(self) -> dist.FileStore:
        return dist.FileStore(self.file_name, self.world_size)

    def _get_process_group(self) -> dist.ProcessGroup:
        store = self._get_store()
        dist.init_process_group(
            "nccl", store=store, rank=self.rank, world_size=self.world_size
        )
        return dist.distributed_c10d._get_default_group()

    def _get_models(
        self,
        embedding_type: str,
        tables: Dict[str, EmbeddingTableProps],
        optimizer_type: OptimType = OptimType.ADAM,
    ) -> DistributedModelParallel:
        torch.manual_seed(0)
        torch.cuda.set_device(self.rank)
        pg = self._get_process_group()
        test_model = (
            TestECModel(
                tables=[
                    EmbeddingConfig(
                        name=table_name,
                        embedding_dim=table.embedding_dim,
                        num_embeddings=table.num_embeddings,
                        feature_names=table.feature_names,
                    )
                    for table_name, table in tables.items()
                ]
            )
            if embedding_type == "EC"
            else TestEBCModel(
                tables=[
                    EmbeddingBagConfig(
                        name=table_name,
                        embedding_dim=table.embedding_dim,
                        num_embeddings=table.num_embeddings,
                        feature_names=table.feature_names,
                        pooling=table.pooling,
                    )
                    for table_name, table in tables.items()
                ]
            )
        )
        planner = EmbeddingShardingPlanner(
            topology=Topology(self.world_size, "cuda"),
            constraints=generate_planner_constraints(tables),
        )
        sharders = [
            cast(
                ModuleSharder[nn.Module],
                EmbeddingCollectionSharder(
                    fused_params={
                        "optimizer": optimizer_type,
                        "beta1": 0.9,
                        "beta2": 0.99,
                    }
                ),
            ),
            cast(
                ModuleSharder[nn.Module],
                EmbeddingBagCollectionSharder(
                    fused_params={"optimizer": optimizer_type}
                ),
            ),
        ]
        plan = planner.collective_plan(test_model, sharders, pg)
        return DistributedModelParallel(
            module=test_model,
            device=torch.device(f"cuda:{self.rank}"),
            env=torchrec.distributed.ShardingEnv.from_process_group(pg),
            plan=plan,
            sharders=sharders,
        )

    @dataclass
    class ModelDeltaTrackerInputTestParams:
        # input parameters
        embedding_type: str
        embedding_tables: Dict[str, EmbeddingTableProps]
        fqns_to_skip: List[str]

    @dataclass
    class FqnToFeatureNamesOutputTestParams:
        # expected output parameters
        expected_fqn_to_feature_names: Dict[str, List[str]]

    @parameterized.expand(
        [
            (
                "EC_model_test",
                ModelDeltaTrackerInputTestParams(
                    embedding_type="EC",
                    embedding_tables={
                        "sparse_table_1": EmbeddingTableProps(
                            num_embeddings=NUM_EMBEDDINGS,
                            embedding_dim=EMBEDDING_DIM,
                            sharding=ShardingType.ROW_WISE,
                            feature_names=["f1", "f2", "f3"],
                            pooling=PoolingType.NONE,
                        ),
                        "sparse_table_2": EmbeddingTableProps(
                            num_embeddings=NUM_EMBEDDINGS,
                            embedding_dim=EMBEDDING_DIM,
                            sharding=ShardingType.ROW_WISE,
                            feature_names=["f4", "f5", "f6"],
                            pooling=PoolingType.NONE,
                        ),
                    },
                    fqns_to_skip=[],
                ),
                FqnToFeatureNamesOutputTestParams(
                    expected_fqn_to_feature_names={
                        "ec.embeddings.sparse_table_1": ["f1", "f2", "f3"],
                        "ec.embeddings.sparse_table_2": ["f4", "f5", "f6"],
                    },
                ),
            ),
            (
                "EBC_model_test",
                ModelDeltaTrackerInputTestParams(
                    embedding_type="EBC",
                    embedding_tables={
                        "sparse_table_1": EmbeddingTableProps(
                            num_embeddings=NUM_EMBEDDINGS,
                            embedding_dim=EMBEDDING_DIM,
                            sharding=ShardingType.ROW_WISE,
                            feature_names=["f1", "f2", "f3"],
                            pooling=PoolingType.SUM,
                        ),
                        "sparse_table_2": EmbeddingTableProps(
                            num_embeddings=NUM_EMBEDDINGS,
                            embedding_dim=EMBEDDING_DIM,
                            sharding=ShardingType.ROW_WISE,
                            feature_names=["f4", "f5", "f6"],
                            pooling=PoolingType.SUM,
                        ),
                    },
                    fqns_to_skip=[],
                ),
                FqnToFeatureNamesOutputTestParams(
                    expected_fqn_to_feature_names={
                        "ebc.embedding_bags.sparse_table_1": ["f1", "f2", "f3"],
                        "ebc.embedding_bags.sparse_table_2": ["f4", "f5", "f6"],
                    },
                ),
            ),
            (
                "EC_model_test_with_duplicate_feature_names",
                ModelDeltaTrackerInputTestParams(
                    embedding_type="EC",
                    embedding_tables={
                        "sparse_table_1": EmbeddingTableProps(
                            num_embeddings=NUM_EMBEDDINGS,
                            embedding_dim=EMBEDDING_DIM,
                            sharding=ShardingType.ROW_WISE,
                            feature_names=["f1", "f2", "f3"],
                            pooling=PoolingType.NONE,
                        ),
                        "sparse_table_2": EmbeddingTableProps(
                            num_embeddings=NUM_EMBEDDINGS,
                            embedding_dim=EMBEDDING_DIM,
                            sharding=ShardingType.ROW_WISE,
                            feature_names=["f3", "f4", "f5"],
                            pooling=PoolingType.NONE,
                        ),
                    },
                    fqns_to_skip=[],
                ),
                FqnToFeatureNamesOutputTestParams(
                    expected_fqn_to_feature_names={
                        "ec.embeddings.sparse_table_1": ["f1", "f2", "f3"],
                        "ec.embeddings.sparse_table_2": ["f3", "f4", "f5"],
                    },
                ),
            ),
            (
                "EBC_model_test_fqns_to_skip",
                ModelDeltaTrackerInputTestParams(
                    embedding_type="EBC",
                    embedding_tables={
                        "sparse_table_1": EmbeddingTableProps(
                            num_embeddings=NUM_EMBEDDINGS,
                            embedding_dim=EMBEDDING_DIM,
                            sharding=ShardingType.ROW_WISE,
                            feature_names=["f1", "f2", "f3"],
                            pooling=PoolingType.SUM,
                        ),
                        "sparse_table_2": EmbeddingTableProps(
                            num_embeddings=NUM_EMBEDDINGS,
                            embedding_dim=EMBEDDING_DIM,
                            sharding=ShardingType.ROW_WISE,
                            feature_names=["f4", "f5", "f6"],
                            pooling=PoolingType.SUM,
                        ),
                    },
                    fqns_to_skip=["sparse_table_1"],
                ),
                FqnToFeatureNamesOutputTestParams(
                    expected_fqn_to_feature_names={
                        "ebc.embedding_bags.sparse_table_2": ["f4", "f5", "f6"],
                    },
                ),
            ),
        ]
    )
    def test_fqn_to_feature_names(
        self,
        _test_name: str,
        input_params: ModelDeltaTrackerInputTestParams,
        output_params: FqnToFeatureNamesOutputTestParams,
    ) -> None:
        model = self._get_models(
            input_params.embedding_type, input_params.embedding_tables
        )
        model_dt = ModelDeltaTracker(model, fqns_to_skip=input_params.fqns_to_skip)
        self.assertEqual(
            model_dt.fqn_to_feature_names(), output_params.expected_fqn_to_feature_names
        )
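The expected keys in these cases ("ec.embeddings.sparse_table_1" and the like) reflect the tracker stripping the DistributedModelParallel wrapper prefix; a standalone illustration of the same replace() call (the raw FQN below is hypothetical):

    # DMP prepends "_dmp_wrapped_module.module." to every FQN; the tracker
    # removes it so keys line up with the unwrapped module's state_dict().
    raw_fqn = "_dmp_wrapped_module.module.ec.embeddings.sparse_table_1"
    print(raw_fqn.replace("_dmp_wrapped_module.module.", ""))
    # -> "ec.embeddings.sparse_table_1"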
