
Commit cc271f1

PaulZhang12 authored and facebook-github-bot committed
Simplify custom op naming for meta functionalization of sparse modules
Summary: Registering custom ops for meta functionalization under id-based names can lead to hash collisions, which in turn produce wrong dimensions for a sparse module. This diff names each custom op after the module type plus the dimensions it returns, ensuring the right dimensions are always returned and significantly simplifying the custom op naming logic.

Differential Revision: D57108438
1 parent 35a7f93 commit cc271f1
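For context, a minimal sketch of the naming change described above (helper names here are illustrative, not torchrec's API): the old op name keyed on id(module), which CPython can reuse once a module is garbage collected, while the new name depends only on the module type and its output dimensions, so it is deterministic.

from typing import List

def old_op_name(module: object) -> str:
    # Old scheme: id(module) is a memory address that can be reused after
    # garbage collection, so two distinct modules may end up colliding.
    return f"{type(module).__name__}_{id(module)}"

def new_op_name(module_name: str, dims: List[int]) -> str:
    # New scheme: deterministic name from module type + returned dims;
    # modules with identical output dims deliberately share one op.
    return f"{module_name}_" + "_".join(str(d) for d in dims)

print(new_op_name("EmbeddingBagCollection", [100]))  # EmbeddingBagCollection_100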

File tree

3 files changed: +146 -78 lines changed


torchrec/ir/tests/test_serializer.py

Lines changed: 81 additions & 29 deletions
@@ -9,16 +9,23 @@
 #!/usr/bin/env python3
 
+import copy
 import unittest
 
 import torch
 from torch import nn
 from torchrec.ir.serializer import JsonSerializer
 
 from torchrec.ir.utils import deserialize_embedding_modules, serialize_embedding_modules
+from torchrec.modules import utils as module_utils
 
 from torchrec.modules.embedding_configs import EmbeddingBagConfig
 from torchrec.modules.embedding_modules import EmbeddingBagCollection
+from torchrec.modules.utils import (
+    operator_registry_state,
+    register_custom_op,
+    register_custom_ops_for_nodes,
+)
 from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
 
 
@@ -27,13 +34,29 @@ def generate_model(self) -> nn.Module:
         class Model(nn.Module):
             def __init__(self, ebc):
                 super().__init__()
-                self.sparse_arch = ebc
+                self.ebc1 = ebc
+                self.ebc2 = copy.deepcopy(ebc)
+                self.ebc3 = copy.deepcopy(ebc)
+                self.ebc4 = copy.deepcopy(ebc)
+                self.ebc5 = copy.deepcopy(ebc)
 
             def forward(
                 self,
                 features: KeyedJaggedTensor,
-            ) -> KeyedTensor:
-                return self.sparse_arch(features)
+            ) -> torch.Tensor:
+                kt1 = self.ebc1(features)
+                kt2 = self.ebc2(features)
+                kt3 = self.ebc3(features)
+                kt4 = self.ebc4(features)
+                kt5 = self.ebc5(features)
+
+                return (
+                    kt1.values()
+                    + kt2.values()
+                    + kt3.values()
+                    + kt4.values()
+                    + kt5.values()
+                )
 
         tb1_config = EmbeddingBagConfig(
             name="t1",
@@ -65,7 +88,7 @@ def test_serialize_deserialize_ebc(self) -> None:
             offsets=torch.tensor([0, 2, 2, 3, 4]),
         )
 
-        eager_kt = model(id_list_features)
+        eager_out = model(id_list_features)
 
         # Serialize PEA
         model, sparse_fqns = serialize_embedding_modules(model, JsonSerializer)
@@ -78,37 +101,66 @@ def test_serialize_deserialize_ebc(self) -> None:
             preserve_module_call_signature=(tuple(sparse_fqns)),
         )
 
-        # Run forward on ExportedProgram
-        ep_output = ep.module()(id_list_features)
-
-        self.assertTrue(isinstance(ep_output, KeyedTensor))
-        self.assertEqual(eager_kt.keys(), ep_output.keys())
-        self.assertEqual(eager_kt.values().shape, ep_output.values().shape)
-
-        # Deserialize EBC
-        deserialized_model = deserialize_embedding_modules(ep, JsonSerializer)
-
-        self.assertTrue(
-            isinstance(deserialized_model.sparse_arch, EmbeddingBagCollection)
-        )
-
-        for deserialized_config, org_config in zip(
-            deserialized_model.sparse_arch.embedding_bag_configs(),
-            model.sparse_arch.embedding_bag_configs(),
-        ):
-            self.assertEqual(deserialized_config.name, org_config.name)
-            self.assertEqual(
-                deserialized_config.embedding_dim, org_config.embedding_dim
-            )
-            self.assertEqual(
-                deserialized_config.num_embeddings, org_config.num_embeddings
-            )
-            self.assertEqual(
-                deserialized_config.feature_names, org_config.feature_names
-            )
+        total_dim = sum(model.ebc1._lengths_per_embedding)
+        with operator_registry_state.op_registry_lock:
+            # Run forward on ExportedProgram
+            ep_output = ep.module()(id_list_features)
+
+            self.assertEqual(eager_out.shape, ep_output.shape)
+
+            # Only 1 custom op registered, as dimensions of ebc are same
+            self.assertEqual(len(operator_registry_state.op_registry_schema), 1)
+
+            # Check if custom op is registered with the correct name:
+            # EmbeddingBagCollection type and total dim
+            self.assertTrue(
+                f"EmbeddingBagCollection_{total_dim}"
+                in operator_registry_state.op_registry_schema
+            )
+
+        # Reset the op registry
+        operator_registry_state.op_registry_schema = {}
+
+        # Reset lib
+        module_utils.lib = torch.library.Library("custom", "FRAGMENT")
+
+        # Ensure custom op is reregistered
+        register_custom_ops_for_nodes(list(ep.graph_module.graph.nodes))
+
+        with operator_registry_state.op_registry_lock:
+            self.assertTrue(
+                f"EmbeddingBagCollection_{total_dim}"
+                in operator_registry_state.op_registry_schema
+            )
+
+        ep.module()(id_list_features)
+        # Deserialize EBC
+        deserialized_model = deserialize_embedding_modules(ep, JsonSerializer)
+
+        for i in range(5):
+            ebc_name = f"ebc{i + 1}"
+            self.assertTrue(
+                isinstance(
+                    getattr(deserialized_model, ebc_name), EmbeddingBagCollection
+                )
+            )
+
+            for deserialized_config, org_config in zip(
+                getattr(deserialized_model, ebc_name).embedding_bag_configs(),
+                getattr(model, ebc_name).embedding_bag_configs(),
+            ):
+                self.assertEqual(deserialized_config.name, org_config.name)
+                self.assertEqual(
+                    deserialized_config.embedding_dim, org_config.embedding_dim
+                )
+                self.assertEqual(
+                    deserialized_config.num_embeddings, org_config.num_embeddings
+                )
+                self.assertEqual(
+                    deserialized_config.feature_names, org_config.feature_names
+                )
 
         # Run forward on deserialized model
-        deserialized_kt = deserialized_model(id_list_features)
+        deserialized_out = deserialized_model(id_list_features)
 
-        self.assertEqual(eager_kt.keys(), deserialized_kt.keys())
-        self.assertEqual(eager_kt.values().shape, deserialized_kt.values().shape)
+        self.assertEqual(eager_out.shape, deserialized_out.shape)
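The reset-and-reregister steps in the test above lean on torch.library; here is a self-contained sketch of the same registration pattern, using a toy op name rather than anything from torchrec:

import torch
from torch.library import Library

# A "FRAGMENT" library extends the custom namespace instead of owning it,
# so ops can be registered incrementally from multiple call sites.
lib = Library("custom", "FRAGMENT")

op_name = "ToyModule_8"  # hypothetical op returning one [batch_size, 8] tensor
lib.define(f"{op_name}(Tensor?[] values, int batch_size) -> Tensor[]")

def toy_impl(values, batch_size):
    device = next((v.device for v in values if v is not None), None)
    assert device is not None, "expects at least one input tensor"
    return [torch.empty(batch_size, 8, device=device)]

# Registering the same callable for Meta gives export a shape-only
# ("fake tensor") formula to trace through.
lib.impl(op_name, toy_impl, "CPU")
lib.impl(op_name, toy_impl, "Meta")

(out,) = torch.ops.custom.ToyModule_8([torch.ones(2, 3)], 4)
print(out.shape)  # torch.Size([4, 8])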

torchrec/modules/embedding_modules.py

Lines changed: 1 addition & 1 deletion
@@ -217,7 +217,7 @@ def _non_strict_exporting_forward(
             features.offsets_or_none(),
         ]  # if want to include the weights: `+ [bag.weight for bag in self.embedding_bags.values()]`
         dims = [sum(self._lengths_per_embedding)]
-        ebc_op = register_custom_op(self, dims)
+        ebc_op = register_custom_op(type(self).__name__, dims)
         outputs = ebc_op(arg_list, batch_size)
         return KeyedTensor(
             keys=self._embedding_names,
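To make the one-line change concrete: dims comes from the context line just above it, so an EBC whose tables have, say, embedding dims 3 and 4 registers a single op named after the summed output width. A sketch with made-up numbers:

# Hypothetical EBC with two tables of embedding dims 3 and 4.
lengths_per_embedding = [3, 4]

dims = [sum(lengths_per_embedding)]  # [7]: one concatenated output tensor
op_name = "EmbeddingBagCollection_" + "_".join(str(d) for d in dims)
print(op_name)  # EmbeddingBagCollection_7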

torchrec/modules/utils.py

Lines changed: 64 additions & 48 deletions
@@ -50,8 +50,6 @@ class OpRegistryState:
 
     # operator schema: {class}.{id} => op_name
     op_registry_schema: Dict[str, str] = {}
-    # operator counter: {class} => count
-    op_registry_counter: Dict[str, int] = defaultdict(int)
 
 
 operator_registry_state = OpRegistryState()
@@ -274,7 +272,8 @@ def _permute_indices(indices: List[int], permute: List[int]) -> List[int]:
 # a list of tensors as output. The operator is registered with the name of
 # {module_class_name}_{instance_count}
 def register_custom_op(
-    module: torch.nn.Module, dims: List[int]
+    module_name: str,
+    dims: List[int],
 ) -> Callable[[List[Optional[torch.Tensor]], int], List[torch.Tensor]]:
     """
     Register a customized operator.
@@ -286,51 +285,68 @@ def register_custom_op(
 
     global operator_registry_state
 
-    m_name: str = type(module).__name__
-    op_id: str = f"{m_name}_{id(module)}"
+    dims_str = "_".join([str(d) for d in dims])
     with operator_registry_state.op_registry_lock:
-        if op_id in operator_registry_state.op_registry_schema:
-            op_name: str = operator_registry_state.op_registry_schema[op_id]
-        else:
-            operator_registry_state.op_registry_counter[m_name] += 1
-            op_name: str = (
-                f"{m_name}_{operator_registry_state.op_registry_counter[m_name]}"
-            )
-            operator_registry_state.op_registry_schema[op_id] = op_name
-
-    def custom_op(
-        values: List[Optional[torch.Tensor]],
-        batch_size: int,
-    ) -> List[torch.Tensor]:
-        device = None
-        for v in values:
-            if v is not None:
-                device = v.device
-                break
-        else:
-            raise AssertionError(
-                f"Custom op {op_name} expects at least one input tensor"
-            )
-
-        return [
-            torch.empty(
-                batch_size,
-                dim,
-                device=device,
-            )
-            for dim in dims
-        ]
-
-    schema_string = f"{op_name}(Tensor?[] values, int batch_size) -> Tensor[]"
-    operator_registry_state.op_registry_schema[op_name] = schema_string
-    # Register schema
-    lib.define(schema_string)
-
-    # Register implementation
-    lib.impl(op_name, custom_op, "CPU")
-    lib.impl(op_name, custom_op, "CUDA")
-
-    # Register meta formula
-    lib.impl(op_name, custom_op, "Meta")
+        op_name: str = f"{module_name}_{dims_str}"
+
+        if op_name in operator_registry_state.op_registry_schema:
+            return getattr(torch.ops.custom, op_name)
+
+        def custom_op(
+            values: List[Optional[torch.Tensor]],
+            batch_size: int,
+        ) -> List[torch.Tensor]:
+            device = None
+            for v in values:
+                if v is not None:
+                    device = v.device
+                    break
+            else:
+                raise AssertionError(
+                    f"Custom op {op_name} expects at least one input tensor"
+                )
+
+            return [
+                torch.empty(
+                    batch_size,
+                    dim,
+                    device=device,
+                )
+                for dim in dims
+            ]
+
+        schema_string = f"{op_name}(Tensor?[] values, int batch_size) -> Tensor[]"
+        operator_registry_state.op_registry_schema[op_name] = schema_string
+        # Register schema
+        lib.define(schema_string)
+
+        # Register implementation
+        lib.impl(op_name, custom_op, "CPU")
+        lib.impl(op_name, custom_op, "CUDA")
+
+        # Register meta formula
+        lib.impl(op_name, custom_op, "Meta")
 
     return getattr(torch.ops.custom, op_name)
+
+
+def register_custom_ops_for_nodes(
+    nodes: List[torch.fx.Node],
+) -> None:
+    """
+    Given a list of nodes, register custom ops if they exist in the nodes.
+    Required for deserialization if in different runtime environments.
+
+    Args:
+        nodes: list of nodes
+    """
+
+    for node in nodes:
+        if "custom." in str(node.target):
+            # e.g. torch.ops.custom.EmbeddingBagCollection_100.default,
+            # where the trailing numbers represent output dimensions
+            op_name = str(node.target).split(".")[-2]
+            register_custom_op(
+                op_name.split("_")[0],
+                [int(dim) for dim in op_name.split("_")[1:]],
+            )
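The round trip that register_custom_ops_for_nodes performs on a node target, shown standalone (the target string is the example from the comment in the diff above):

# Node target as rendered in an exported graph; the trailing ".default" is
# the overload name, and the segment before it is the registered op name.
target = "custom.EmbeddingBagCollection_100.default"

op_name = target.split(".")[-2]                   # "EmbeddingBagCollection_100"
module_name = op_name.split("_")[0]               # "EmbeddingBagCollection"
dims = [int(d) for d in op_name.split("_")[1:]]   # [100]

assert (module_name, dims) == ("EmbeddingBagCollection", [100])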
