Completed docs

cehongwang · cehongwang · commit 157b5bae92db · 2024-08-13T14:57:47.000-07:00
diff --git a/docsrc/py_api/torch_tensorrt.rst b/docsrc/py_api/torch_tensorrt.rst
@@ -32,6 +32,9 @@ Functions
 
 Classes
 ---------
+.. autoclass:: MutableTorchTensorRTModule
+   :members:
+   :special-members: __init__
 
 .. autoclass:: Input
    :members:
diff --git a/examples/dynamo/README.rst b/examples/dynamo/README.rst
@@ -13,4 +13,5 @@ a number of ways you can leverage this backend to accelerate inference.
 * :ref:`torch_export_cudagraphs`: Using the Cudagraphs integration with `ir="dynamo"`
 * :ref:`custom_kernel_plugins`: Creating a plugin to use a custom kernel inside TensorRT engines
 * :ref:`refit_engine_example`: Refitting a compiled TensorRT Graph Module with updated weights
+* :ref:`mutable_torchtrt_module_example`: Compile, use, and modify TensorRT Graph Module with MutableTorchTensorRTModule
 * :ref:`vgg16_fp8_ptq`: Compiling a VGG16 model with FP8 and PTQ using ``torch.compile``
diff --git a/examples/dynamo/mutable_torchtrt_module_example.py b/examples/dynamo/mutable_torchtrt_module_example.py
@@ -1,19 +1,19 @@
 """
-.. _refit_engine_example:
+.. _mutable_torchtrt_module_example:
 
-Refit  TenorRT Graph Module with Torch-TensorRT
+Mutable Torch TensorRT Module
 ===================================================================
 
-We are going to demonstrate how a compiled TensorRT Graph Module can be refitted with updated weights.
+We are going to demonstrate how we can easily use the Mutable Torch TensorRT Module to compile, interact with, and modify the TensorRT Graph Module.
 
-In many cases, we frequently update the weights of models, such as applying various LoRA to Stable Diffusion or constant A/B testing of AI products.
-That poses challenges for TensorRT inference optimizations, as compiling the TensorRT engines takes significant time, making repetitive compilation highly inefficient.
-Torch-TensorRT supports refitting TensorRT graph modules without re-compiling the engine, considerably accelerating the workflow.
+Compiling a Torch-TensorRT module is straightforward, but modifying the compiled module can be challenging, especially when it comes to maintaining the state and connection between the PyTorch module and the corresponding Torch-TensorRT module.
+In Ahead-of-Time (AoT) scenarios, integrating Torch TensorRT with complex pipelines, such as the Hugging Face Stable Diffusion pipeline, becomes even more difficult.
+The Mutable Torch TensorRT Module is designed to address these challenges, making interaction with the Torch-TensorRT module easier than ever.
 
 In this tutorial, we are going to walk through
-1. Compiling a PyTorch model to a TensorRT Graph Module
-2. Save and load a graph module
-3. Refit the graph module
+1. Sample workflow of Mutable Torch TensorRT Module with ResNet 18
+2. Save a Mutable Torch TensorRT Module
+3. Integration with the Hugging Face pipeline in a LoRA use case
 """
 
 import numpy as np
@@ -26,29 +26,86 @@
 inputs = [torch.rand((1, 3, 224, 224)).to("cuda")]
 
 # %%
-# Compile the module for the first time and save it.
+# Initialize the Mutable Torch TensorRT Module with settings.
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-kwargs = {
+settings = {
     "use_python": False,
     "enabled_precisions": {torch.float32},
     "make_refitable": True,
 }
 
 model = models.resnet18(pretrained=False).eval().to("cuda")
-model2 = models.resnet18(pretrained=True).eval().to("cuda")
-mutable_module = torch_trt.MutableTorchTensorRTModule(model, **kwargs)
+mutable_module = torch_trt.MutableTorchTensorRTModule(model, **settings)
+# You can use the mutable module just like the original PyTorch module. The compilation happens the first time you call the mutable module.
 mutable_module(*inputs)
 
+# %%
+# Make modifications to the mutable module.
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# %%
+# Making changes to the mutable module can trigger refit or re-compilation. For example, loading a different state_dict and setting new weight values will trigger refit, and adding a module to the model will trigger re-compilation.
+model2 = models.resnet18(pretrained=True).eval().to("cuda")
 mutable_module.load_state_dict(model2.state_dict())
 
 
 # Check the output
+# The refit happens when you call the mutable module again.
 expected_outputs, refitted_outputs = model2(*inputs), mutable_module(*inputs)
 for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
     assert torch.allclose(
         expected_output, refitted_output, 1e-2, 1e-2
     ), "Refit Result is not correct. Refit failed"
 
 print("Refit successfully!")
+
+# %%
+# Saving Mutable Torch TensorRT Module
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Currently, saving is only enabled for the C++ runtime, not the Python runtime.
 torch_trt.MutableTorchTensorRTModule.save(mutable_module, "mutable_module.pkl")
 reload = torch_trt.MutableTorchTensorRTModule.load("mutable_module.pkl")
+
+# %%
+# Stable Diffusion with Huggingface
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# The LoRA checkpoint is from https://civitai.com/models/12597/moxin
+
+from diffusers import DiffusionPipeline
+
+with torch.no_grad():
+    kwargs = {
+        "use_python_runtime": True,
+        "enabled_precisions": {torch.float16},
+        "debug": True,
+        "make_refitable": True,
+    }
+
+    model_id = "runwayml/stable-diffusion-v1-5"
+    device = "cuda:0"
+
+    prompt = "portrait of a woman standing, shuimobysim, wuchangshuo, best quality"
+    negative = "(worst quality:2), (low quality:2), (normal quality:2), lowres, normal quality, skin spots, acnes, skin blemishes, age spot, glans, (watermark:2),"
+
+    pipe = DiffusionPipeline.from_pretrained(
+        model_id, revision="fp16", torch_dtype=torch.float16
+    )
+    pipe.to(device)
+
+    # The only extra line you need
+    pipe.unet = torch_trt.MutableTorchTensorRTModule(pipe.unet, **kwargs)
+
+    image = pipe(prompt, negative_prompt=negative, num_inference_steps=30).images[0]
+    image.save("./without_LoRA_mutable.jpg")
+
+    # Standard Huggingface LoRA loading procedure
+    pipe.load_lora_weights("./moxin.safetensors", adapter_name="lora1")
+    pipe.set_adapters(["lora1"], adapter_weights=[1])
+    pipe.fuse_lora()
+    pipe.unload_lora_weights()
+
+    # Refit triggered
+    image = pipe(prompt, negative_prompt=negative, num_inference_steps=30).images[0]
+    image.save("./with_LoRA_mutable.jpg")
diff --git a/examples/dynamo/mutable_torchtrt_module_stable_diffusion_example.py b/examples/dynamo/mutable_torchtrt_module_stable_diffusion_example.py
diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py
@@ -38,6 +38,17 @@ def get_state(self) -> RefitFlag:
 
 
 class MutableTorchTensorRTModule(object):
+    """
+    Initialize a MutableTorchTensorRTModule to seamlessly manipulate it like a regular PyTorch module.
+    All TensorRT compilation and refitting processes are handled automatically as you work with the module.
+    Any changes to its attributes or loading a different state_dict will trigger refitting or recompilation,
+    which will be managed during the next forward pass.
+
+    The MutableTorchTensorRTModule takes a PyTorch module and a set of configuration settings for the compiler.
+    Once compilation is complete, the module maintains the connection between the TensorRT graph module and the original PyTorch module.
+    Any modifications made to the MutableTorchTensorRTModule will be reflected in both the TensorRT graph module and the original PyTorch module.
+    """
+
     def __init__(
         self,
         pytorch_model: torch.nn.Module,
@@ -75,15 +86,6 @@ def __init__(
         **kwargs: Any,
     ) -> None:
         """
-        Initialize a MutableTorchTensorRTModule. This module can be manipulated just as a normal PyTorch module
-        and all TRT compilation and refit happens underthe hood as the user is using it. Modifying its attribute or
-        loading a different state_dict can trigger refit/recompilation that will be handled in the next forward run.
-
-        MutableTorchTensorRTModule takes a PyTorch module and a set of settings to configure the compiler.
-        After compilation is finished, MutableTorchTensorRTModule maintains the connection between the TRT graph module
-        and the original PyTorch module. And modification to MutableTorchTensorRTModule will reflect in both TRT graph module
-        and original PyTorch module.
-
 
         Arguments:
             pytorch_model (torch.nn.module): Source module that needs to be accelerated
@@ -148,7 +150,6 @@ def __init__(
         assert (
             make_refitable
         ), "'make_refitable' has to be True for a MutableTorchTensorRTModule."
-        make_refitable = True
         compilation_options = {
             "enabled_precisions": (
                 enabled_precisions
@@ -309,6 +310,7 @@ def store_inputs(self, arg_inputs: Any, kwarg_inputs: Any) -> None:
 
     @staticmethod
     def process_kwarg_inputs(inputs: Any) -> Any:
+        # Process kwarg inputs to be acceptable for Torch-TensorRT
         if isinstance(inputs, dict):
             # None should be excluded. AOT compile also does not allow dynamic control flow, bool is also excluded.
             return {
@@ -537,7 +539,7 @@ def load(path: str) -> Any:
 
 
 def recursively_remove_trigger(obj: Any) -> Any:
-    # Not save: If the object has a loop (such as a doubly linkded list), this will cause infinite recursion
+    # Not safe: If the object has a circular reference (such as a doubly linked list), this will cause infinite recursion
     if obj.__class__.__name__ == "ChangeTriggerWrapper":
         obj = obj.instance