import logging
from typing import Dict, List, Optional, Sequence, Tuple

import torch

from torch.fx.passes.splitter_base import (
    Subgraph,
    _SplitterBase,
    _SplitterSettingBase,
    FxNetAccNodesFinder,
    FxNetAccFusionsFinder,
)
import torch.fx.passes.operator_support as ops
from torch.fx.passes.tools_common import NodeSet, CALLABLE_NODE_OPS
from torch.fx.node import Target

from torch_tensorrt.dynamo.conversion.converter_registry import ConverterRegistry
from .common import DEFAULT_SINGLE_NODE_PARTITIONS
from torch_tensorrt.dynamo._defaults import MIN_BLOCK_SIZE

from torch_tensorrt.dynamo import DYNAMO_CONVERTERS as CONVERTERS


logger = logging.getLogger(__name__)


class OpSupportTester(ops.OperatorSupportBase):
    """Class to determine whether operators within a module are supported"""

    def __init__(self, torch_executed_ops: Sequence[Target] = set()) -> None:
        super().__init__()

        # Initialize dictionaries of supported/unsupported operator counts
        self.supported_operators: Dict[str, int] = {}
        self.unsupported_operators: Dict[str, int] = {}
        self.torch_executed_ops = torch_executed_ops

    def is_node_supported(
        self, submodules: Dict[str, torch.nn.Module], node: torch.fx.Node
    ) -> bool:
        node_name = ConverterRegistry.qualified_name_or_str(node.target)

        if node in CONVERTERS and node_name not in self.torch_executed_ops:
            # If node is a proper, supported computational node, store the operator
            if not node.is_impure():
                if node_name not in self.supported_operators:
                    self.supported_operators[node_name] = 1
                else:
                    self.supported_operators[node_name] += 1

            return True
        else:
            if not node.is_impure():
                if node_name not in self.unsupported_operators:
                    self.unsupported_operators[node_name] = 1
                else:
                    self.unsupported_operators[node_name] += 1

            return False

    def print_support_overview(self, num_trt_blocks: Optional[int] = None) -> None:
        if num_trt_blocks is not None:
            logger.debug(
                f"\nNumber of TensorRT-Accelerated Engines Generated: {num_trt_blocks}"
            )

        # Reformat support messages so the node overview prints as a single debug string
        supported_nodes_str = "\nSupported Nodes:\n"
        for node_name, count in self.supported_operators.items():
            supported_nodes_str += f"- {node_name} + Operator Count: {count}\n"

        logger.debug(supported_nodes_str)

        if self.unsupported_operators:
            unsupported_nodes_str = "\nUnsupported or Excluded Nodes:\n"
            for node_name, count in self.unsupported_operators.items():
                unsupported_nodes_str += f"- {node_name} + Operator Count: {count}\n"

            logger.debug(unsupported_nodes_str)
        else:
            logger.debug("\nAll Nodes Supported\n")


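# A minimal sketch of exercising OpSupportTester in isolation, assuming an
# aten-level FX GraphModule `gm` is already available; the excluded op name is
# purely illustrative. Useful for auditing converter coverage before partitioning.
#
#   tester = OpSupportTester(torch_executed_ops={"torch.ops.aten.add.Tensor"})
#   submodules = dict(gm.named_modules())
#   for node in gm.graph.nodes:
#       if node.op in CALLABLE_NODE_OPS:
#           tester.is_node_supported(submodules, node)
#   tester.print_support_overview()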
class TRTPartitioner(_SplitterBase):
    """Partitioner to split an FX graph into subgraphs based on operator support

    Adapted from, and modified for the Torch-TensorRT Dynamo case:
    https://github.com/pytorch/pytorch/blob/93f538db355ea10c684a57f7a632ed03292ef98f/torch/fx/passes/splitter_base.py#L256C9-L871

    Args:
        module: FX GraphModule to partition
        operator_support: OperatorSupport class describing allowed operators
        allowed_single_node_partition_ops: Nodes which can be included in single-node partitions.
            Generally useful for module-level exclusion ops which are intensive despite being single functions
        min_block_size: Minimum number of computational operators per block
    Returns:
        torch.fx.GraphModule
    """

    def __init__(
        self,
        module: torch.fx.GraphModule,
        operator_support: ops.OperatorSupportBase,
        allowed_single_node_partition_ops: Optional[
            Sequence[str]
        ] = DEFAULT_SINGLE_NODE_PARTITIONS,
        min_block_size: int = MIN_BLOCK_SIZE,
    ):
        """
        Preprocesses graph before splitting:
        - finds nodes supported by ACC,
        - finds fusion groups for ACC nodes having non-tensor IO,
        - builds a graph of direct dependencies,
        - builds a map of fused nodes to their fusions.
        As a result we get self.acc_nodes, self.deps and self.fusions.
        """
        assert isinstance(module, torch.fx.GraphModule)

        self.module = module

        self.settings = _SplitterSettingBase(
            min_acc_module_size=min_block_size,
            allow_non_tensor=True,
        )
        self.operator_support = operator_support

        # Get all accelerated nodes based on operator support conditions
        self.acc_nodes = FxNetAccNodesFinder(
            self.module, self.operator_support, self.settings.allow_non_tensor
        )()

        if self.settings.skip_fusion:
            self.fusions = {}
        else:
            self.fusions = FxNetAccFusionsFinder(module, set(self.acc_nodes))()

        # Modify deps to add more deps for fused nodes
        self.deps = self.find_deps()
        self.update_deps_for_fusions()

        self.non_acc_submodule_name = "_run_on_gpu_"
        self._node_submodule_map: Dict[str, str] = {}

        self.num_trt_accelerated_subgraphs = None
        self.allowed_single_node_partition_ops = allowed_single_node_partition_ops

    def remove_small_acc_subgraphs(self, subgraphs: List[Subgraph]) -> List[Subgraph]:
        """
        This pass finds ACC submodules with less than specified size and merges
        them with adjacent GPU submodules.
        """
        result: List[Subgraph] = []
        for subgraph in subgraphs:
            if subgraph.is_acc:
                if len(subgraph.nodes) >= self.settings.min_acc_module_size or any(
                    ConverterRegistry.qualified_name_or_str(node.target)
                    in self.allowed_single_node_partition_ops
                    for node in subgraph.nodes
                ):
                    result.append(subgraph)
                else:
                    logger.debug(
                        "Eliminating acc subgraph because it's smaller than the threshold: "
                        f"{len(subgraph.nodes)} < {self.settings.min_acc_module_size}"
                    )
                    if result:
                        result[-1].nodes.extend(subgraph.nodes)
                    else:
                        subgraph.is_acc = False
                        result.append(subgraph)
            else:
                if result and not result[-1].is_acc:
                    result[-1].nodes.extend(subgraph.nodes)
                else:
                    result.append(subgraph)
        return result
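    # Behavior sketch for remove_small_acc_subgraphs, with node counts assumed
    # purely for illustration: given min_acc_module_size=5,
    #
    #   subgraphs = [
    #       Subgraph(is_acc=False, nodes=gpu_nodes),    # Torch/GPU segment
    #       Subgraph(is_acc=True, nodes=[conv, relu]),  # only 2 acc nodes
    #   ]
    #   merged = partitioner.remove_small_acc_subgraphs(subgraphs)
    #
    # folds the undersized acc segment into the preceding GPU segment, so no
    # tiny TRT engine is emitted. If one of its ops appears in
    # allowed_single_node_partition_ops, the segment is instead kept as-is.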

    def partition_graph(self) -> torch.fx.GraphModule:
        """Partitions the GraphModule into subgraphs based on operator support

        Returns a GraphModule with submodules for each segment
        """
        # Delegate nodes based on operator coverage
        subgraphs = self.put_nodes_into_subgraphs()

        # Remove segments smaller than the block size (with exceptions)
        subgraphs = self.remove_small_acc_subgraphs(subgraphs)

        # Set the number of TRT engines to be generated
        self.num_trt_accelerated_subgraphs = len([s for s in subgraphs if s.is_acc])

        # Tag the accelerated nodes and split the graph accordingly
        self.tag(subgraphs)
        return self.split()

    def starter_nodes(self) -> Tuple[NodeSet, NodeSet]:
        """Generates starter nodes for partitioning + segmentation"""
        # Starter accelerated nodes are all callable accelerated ops
        starter_acc_nodes = {
            node for node in self.acc_nodes if node.op in CALLABLE_NODE_OPS
        }

        # Starter non-accelerated nodes are the rest of the callable nodes
        starter_non_acc_nodes = {
            node
            for node in self.module.graph.nodes
            if (node not in starter_acc_nodes and node.op in CALLABLE_NODE_OPS)
        }

        return starter_non_acc_nodes, starter_acc_nodes

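# A minimal sketch of driving TRTPartitioner directly, assuming `gm` is an
# aten-level FX GraphModule (e.g. produced by a Dynamo/torch.export trace);
# the min_block_size value is illustrative:
#
#   supported_ops = OpSupportTester()
#   partitioner = TRTPartitioner(gm, supported_ops, min_block_size=5)
#   partitioned_gm = partitioner.partition_graph()
#   print(partitioner.num_trt_accelerated_subgraphs)
#
# Non-accelerated segments land in submodules prefixed with "_run_on_gpu_"
# (see non_acc_submodule_name above), while accelerated segments become the
# TRT-eligible submodules counted by num_trt_accelerated_subgraphs.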

def partition(
    gm: torch.fx.GraphModule,
    verbose: bool = True,
    min_block_size: int = MIN_BLOCK_SIZE,
    torch_executed_ops: Sequence[Target] = set(),
) -> torch.fx.GraphModule:
    """Partition an FX GraphModule with aten ops into TRT engines
    Partitioning is based on converter operator support

    Args:
        gm: FX GraphModule to partition
        verbose: Bool representing whether to print operator support
        min_block_size: Minimum number of operators per TRT-Engine Block
        torch_executed_ops: Sequence of operations to run in Torch, regardless of converter coverage
    Returns:
        torch.fx.GraphModule
    """
    # Ensure graph is clean prior to partitioning
    gm.graph.eliminate_dead_code()
    gm.graph.lint()
    gm.recompile()

    # Construct the operator support checker and graph partitioner
    supported_ops = OpSupportTester(torch_executed_ops=torch_executed_ops)
    partitioner = TRTPartitioner(gm, supported_ops, min_block_size=min_block_size)

    partitioned_graph = partitioner.partition_graph()

    if verbose:
        supported_ops.print_support_overview(partitioner.num_trt_accelerated_subgraphs)

    return partitioned_graph
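

# A minimal end-to-end sketch, assuming a user-defined `MyModel` and an export
# step that yields an aten-level GraphModule (the model, input shape, and
# excluded op are illustrative assumptions):
#
#   model = MyModel().eval().cuda()
#   inputs = (torch.randn(1, 3, 224, 224, device="cuda"),)
#   gm = torch.export.export(model, inputs).module()
#   partitioned_gm = partition(
#       gm,
#       verbose=True,
#       min_block_size=5,
#       torch_executed_ops={"torch.ops.aten.sub.Tensor"},
#   )
#
# Each child submodule of the result is either a TRT-eligible segment or a
# Torch-fallback segment, and the verbose flag logs the per-operator support
# overview gathered by OpSupportTester.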