Skip to content

Commit ce90e25

Browse files
authored
Fixing issues encountered while trying to to-onnx applecider models. (#565)
* Fixing issues encountered while trying to to-onnx applecider models. * Move set_default_device to a function that is called once. * Bug fixes in `engine`. Correctly import InferenceDataSetWriter. Fix way we write onnx_results. * Copy the to_tensor.py file to the onnx output directory. * Adding initial version of dynamo-based torch onnx export.
1 parent eca88a7 commit ce90e25

File tree

5 files changed

+118
-19
lines changed

5 files changed

+118
-19
lines changed

src/hyrax/model_exporters.py

Lines changed: 85 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,21 @@ def export_to_onnx(model, sample, config, ctx):
7878

7979

8080
def _export_pytorch_to_onnx(model, sample, output_filepath, opset_version):
81-
"""Specific implementation to convert PyTorch model to ONNX format. This
82-
function will also:
83-
- Run `sample` through the model before converting the model to ONNX
84-
- Convert `sample` to a numpy array
81+
"""Specific implementation to convert PyTorch model to ONNX format. This uses
82+
the older (torch<2.9) export capabilities. It only supports up to opset
83+
version 20.
84+
85+
Parameters
86+
----------
87+
model : torch.nn.Module
88+
The PyTorch model to be converted to ONNX format.
89+
sample : NumPy array or list of NumPy arrays
90+
A sample of input data to the model. This is used to trace the model
91+
during the export process.
92+
output_filepath : Path
93+
The file path where the ONNX model will be saved.
94+
opset_version : int
95+
The ONNX opset version to use for the export.
8596
"""
8697

8798
# deferred import to reduce start up time
@@ -106,19 +117,23 @@ def _export_pytorch_to_onnx(model, sample, output_filepath, opset_version):
106117
input_names = []
107118
dynamic_axes = {}
108119

109-
# ! Currently `sample` is either a tuple or bare numpy array. But after it
110-
# ! goes through `default_convert` above, it becomes a list of Tensors.
120+
# torch_sample is returned from default_convert as either a single Tensor or
121+
# a list of Tensors.
111122
if isinstance(torch_sample, list):
112123
for i in range(len(torch_sample)):
113124
# For supervised models, the label or target should be empty
114-
# so we will not include those in the input names.
125+
# so we will not include those in the input names. Any labels should
126+
# be the last element in the list.
115127
if len(torch_sample[i]):
116128
input_names.append(f"input_{i}")
117129
dynamic_axes[f"input_{i}"] = {0: "batch_size"}
118130
else:
119131
input_names.append("input")
120132
dynamic_axes["input"] = {0: "batch_size"}
121133

134+
# Output is assumed to always have a dynamic batch size.
135+
dynamic_axes["output"] = {0: "batch_size"}
136+
122137
# export the model to ONNX format
123138
export(
124139
model,
@@ -128,11 +143,73 @@ def _export_pytorch_to_onnx(model, sample, output_filepath, opset_version):
128143
input_names=input_names,
129144
output_names=["output"],
130145
dynamic_axes=dynamic_axes,
146+
dynamo=False, # newer versions of torch will use dynamo by default
131147
)
132148

133149
# Make sure that the output is on the CPU
134150
if sample_out.device.type != "cpu":
135151
sample_out = sample_out.to("cpu")
136152

137153
# Return the output of the model as numpy array
138-
return sample_out.numpy()
154+
return sample_out.detach().numpy()
155+
156+
157+
def _export_pytorch_to_onnx_v2(model, sample, output_filepath, opset_version):
158+
"""Currently unused.
159+
Specific implementation to convert PyTorch model to ONNX format using
160+
torch Dynamo export capabilities.
161+
162+
Parameters
163+
----------
164+
model : torch.nn.Module
165+
The PyTorch model to be converted to ONNX format.
166+
sample : NumPy array or list of NumPy arrays
167+
A sample of input data to the model. This is used to trace the model
168+
during the export process.
169+
output_filepath : Path
170+
The file path where the ONNX model will be saved.
171+
opset_version : int
172+
The ONNX opset version to use for the export.
173+
"""
174+
175+
# deferred import to reduce start up time
176+
import torch
177+
from torch.onnx import export
178+
from torch.utils.data.dataloader import default_convert
179+
180+
# set model in eval mode and move it to the CPU to prep for export to ONNX.
181+
model.train(False)
182+
model.to("cpu")
183+
184+
# set the default device to CPU and convert the sample to torch Tensors
185+
torch.set_default_device("cpu")
186+
torch_sample = default_convert(sample)
187+
188+
# Run a single sample through the model. We'll check this against the output
189+
# from the ONNX version to make sure it's the same, i.e. `np.assert_allclose`.
190+
sample_out = model(torch_sample)
191+
# Make sure that the output is on the CPU, detached, and as a numpy array
192+
sample_out = sample_out.to("cpu").detach().numpy()
193+
194+
dynamic_shapes = []
195+
batch = torch.export.Dim("batch")
196+
197+
# TODO: This should be built dynamically based on the structure of torch_sample.
198+
dynamic_shapes = [[{0: batch}, {0: batch}, {}]]
199+
200+
export(
201+
model,
202+
(torch_sample,), # exporter expects a tuple of inputs for `forward`
203+
output_filepath,
204+
opset_version=opset_version,
205+
dynamo=True,
206+
dynamic_shapes=dynamic_shapes,
207+
verbose=True,
208+
report=True,
209+
dump_exported_program=True,
210+
artifacts_dir=output_filepath.parent,
211+
input_names=["input"],
212+
output_names=["output"],
213+
)
214+
215+
return sample_out

src/hyrax/pytorch_ignite.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,7 @@ def create_engine(funcname: str, device: torch.device, model: torch.nn.Module, c
548548
config : dict
549549
The runtime config in use
550550
"""
551+
torch.set_default_device(device.type)
551552
return Engine(_create_process_func(funcname, device, model, config))
552553

553554

src/hyrax/verbs/engine.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def run(self, model_directory: str = None):
3535
[x] Implement a simple strategy for reading in batches of data samples
3636
[x] Process the samples with any custom collate functions as well as a default collate function
3737
[x] Pass the collated batch to the appropriate to_tensor function
38-
[ ] Send that output to the ONNX-ified model
38+
[x] Send that output to the ONNX-ified model
3939
[x] Persist the results of inference.
4040
"""
4141
from pathlib import Path
@@ -46,7 +46,7 @@ def run(self, model_directory: str = None):
4646
create_results_dir,
4747
find_most_recent_results_dir,
4848
)
49-
from hyrax.data_sets.inference_dataset import InferenceDatasetWriter
49+
from hyrax.data_sets.inference_dataset import InferenceDataSetWriter
5050
from hyrax.plugin_utils import load_to_tensor
5151
from hyrax.pytorch_ignite import setup_dataset
5252

@@ -73,7 +73,7 @@ def run(self, model_directory: str = None):
7373
to_tensor_fn = load_to_tensor(input_directory)
7474

7575
# ~ Load the ONNX model from the input directory.
76-
onnx_file_name = input_directory / "model.onnx"
76+
onnx_file_name = input_directory / "example_model_opset_20.onnx"
7777
ort_session = onnxruntime.InferenceSession(onnx_file_name)
7878

7979
# ~ For now we use `setup_dataset` to get our datasets back. Later we can
@@ -94,7 +94,7 @@ def run(self, model_directory: str = None):
9494
# as a type hint. So we may need to separate InferenceDataset and IDWriter
9595
# to remove that dependency.
9696
result_dir = create_results_dir(config, "engine")
97-
self.results_writer = InferenceDatasetWriter(infer_dataset, result_dir)
97+
self.results_writer = InferenceDataSetWriter(infer_dataset, result_dir)
9898

9999
# Work through the dataset in steps of `batch_size`
100100
for start_idx in range(0, len(infer_dataset), batch_size):
@@ -108,9 +108,21 @@ def run(self, model_directory: str = None):
108108
# ~ Pass the collated batch to the to_tensor function
109109
prepared_batch = to_tensor_fn(collated_batch)
110110

111-
# Then we would send that output to the ONNX-ified model.
112-
ort_inputs = {ort_session.get_inputs()[0].name: prepared_batch}
113-
onnx_results = ort_session.run(None, ort_inputs) # infer with ONNX
111+
# Create the inputs array for the ONNX model using the expected inputs
112+
# from the loaded ONNX model and the type and shape of the prepared batch.
113+
ort_inputs = {}
114+
if isinstance(prepared_batch, tuple):
115+
for i in range(len(prepared_batch)):
116+
# For a supervised model, we expect that at least one of the
117+
# elements in the prepared batch will be empty, so we only
118+
# add non-empty inputs.
119+
if len(prepared_batch[i]):
120+
ort_inputs[ort_session.get_inputs()[i].name] = prepared_batch[i]
121+
else:
122+
ort_inputs = {ort_session.get_inputs()[0].name: prepared_batch}
123+
124+
# Run the ONNX model with the prepared batch as input
125+
onnx_results = ort_session.run(None, ort_inputs)
114126

115127
# ~ Finally, we persist the results of inference.
116128
# For now, collated_batch will always have an "object_id" key that
@@ -122,8 +134,10 @@ def run(self, model_directory: str = None):
122134
msg += f"Could not determine object IDs from batch. Batch has keys {collated_batch.keys()}"
123135
raise RuntimeError(msg)
124136

125-
# ~ We may not need to do the list comprehension for batch_results, it's
126-
# possible that ONNX will already return it in this form.
127-
self.results_writer.write_batch(collated_batch["object_id"], [t for t in onnx_results])
137+
# Save the output of the onnx model per batch. Onnx results are
138+
# returned as a 1-element list containing a numpy array with first
139+
# dimension as batch size.
140+
self.results_writer.write_batch(collated_batch["object_id"], [i for i in onnx_results[0]])
128141

142+
# Write the final index file for the inference results.
129143
self.results_writer.write_index()

src/hyrax/verbs/to_onnx.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def run_cli(self, args=None):
2929

3030
def run(self, input_model_directory: str = None):
3131
"""Export the model to ONNX format and save it to the specified path."""
32+
import shutil
3233
from pathlib import Path
3334

3435
from hyrax.config_utils import (
@@ -66,6 +67,12 @@ def run(self, input_model_directory: str = None):
6667
config_manager = ConfigManager(runtime_config_filepath=config_file)
6768
config_from_training = config_manager.config
6869

70+
# copy the to_tensor.py file from the input directory to the output directory
71+
to_tensor_src = input_directory / "to_tensor.py"
72+
to_tensor_dst = output_dir / "to_tensor.py"
73+
if to_tensor_src.exists():
74+
shutil.copy(to_tensor_src, to_tensor_dst)
75+
6976
# Use the config file to locate and assemble the trained weight file path
7077
weights_file_path = input_directory / config_from_training["train"]["weights_filename"]
7178

tests/hyrax/test_nan.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def test_nan_handling_off_returns_input(loopback_hyrax_nan):
133133
def to_tensor(data_dict):
134134
data = data_dict.get("data", {})
135135
if "image" in data and "label" in data:
136-
image = tensor(data["image"])
136+
image = data["image"]
137137
label = data["label"]
138138
return (image, label)
139139

0 commit comments

Comments
 (0)