From 443455370eeda8dd4b7033f216ca2f894d0e7c95 Mon Sep 17 00:00:00 2001 From: inocsin Date: Thu, 13 Jan 2022 15:11:56 +0800 Subject: [PATCH 01/22] feat: [collection] make torch_tensorrt::core::ir::Input and torch_tensorrt::Input compatible with IValue. Support simple case of tuple input model. Add unit test. Signed-off-by: inocsin --- core/compiler.cpp | 17 ++-- core/compiler.h | 10 ++- core/conversion/conversion.cpp | 1 + core/ir/StaticParams.cpp | 5 +- core/ir/ir.cpp | 50 +++++++++-- core/ir/ir.h | 24 +++++- cpp/include/torch_tensorrt/torch_tensorrt.h | 28 ++++++- cpp/src/compile_spec.cpp | 66 +++++++++++++++ cpp/src/torch_tensorrt.cpp | 4 + tests/cpp/BUILD | 20 ++++- tests/cpp/test_collection.cpp | 91 +++++++++++++++++++++ tests/py/test_collection.py | 55 +++++++++++++ 12 files changed, 354 insertions(+), 17 deletions(-) create mode 100644 tests/cpp/test_collection.cpp create mode 100644 tests/py/test_collection.py diff --git a/core/compiler.cpp b/core/compiler.cpp index b684b808f5..45ecd3c993 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -254,6 +254,7 @@ GraphAndMapping ConstructFallbackGraph( // update the input ranges for each segments convert_cfg.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params); + // TODO mapping Inputs Ivalue to flatten one here auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, static_params); auto temp_g = std::make_shared(); auto device_spec = convert_cfg.engine_settings.device; @@ -307,11 +308,17 @@ void MapInputsAndDetermineDTypes( ir::TypeMap& first_use_type_map) { // Associate input specs with inputs cfg.convert_info.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params)); - - for (auto& in : g->inputs()) { - if (static_params.find(in) == static_params.end()) { + auto tensor_inputs = ir::get_tensor_inputs(g, static_params); + LOG_DEBUG("In MapInputsAndDetermineDTypes " << "g->inputs() size " << g->inputs().size() << ", tensor_inputs size " << tensor_inputs.size()); + // for (auto& in : g->inputs()) { + // if (static_params.find(in) == static_params.end()) { + for (auto in : tensor_inputs) { ir::Input& spec = cfg.convert_info.inputs.find(in)->second; - auto est_type_opt = first_use_type_map.find(in)->second; + c10::optional est_type_opt = {}; + auto est_it = first_use_type_map.find(in); + if (est_it != first_use_type_map.end()) { + est_type_opt = first_use_type_map.find(in)->second; + } if (est_type_opt && !spec.dtype_is_user_defined) { // If we can calculate the type from the graph and the type was not defined by the user then use the calculated // type @@ -354,7 +361,7 @@ void MapInputsAndDetermineDTypes( // The user defined the type so no changes are necessary } } - } + // } } uint64_t GetRecommendedWorkspaceSize(const runtime::CudaDevice& device) { diff --git a/core/compiler.h b/core/compiler.h index c1bb85aa3b..1743f566b9 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -8,12 +8,20 @@ #include "core/partitioning/partitioning.h" #include "core/runtime/runtime.h" #include "torch/csrc/jit/api/module.h" +#include "torch/csrc/jit/ir/ir.h" namespace torch_tensorrt { namespace core { struct CompileSpec { - CompileSpec(std::vector inputs) : inputs(inputs) {} + CompileSpec(std::vector inputs) : inputs(inputs) { + graph_inputs = ir::GraphInputs(inputs); + } + CompileSpec(torch::jit::IValue& input_signature) { + graph_inputs = ir::GraphInputs(input_signature); + inputs = graph_inputs.flattened_inputs; + } + ir::GraphInputs graph_inputs; std::vector inputs; 
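+  // Intended relationship between the two members: graph_inputs carries the
+  // full, possibly nested spec, while `inputs` keeps the flattened view the
+  // rest of the pipeline consumes; e.g. a signature shaped like ((a, b), c)
+  // flattens to {a, b, c}.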
conversion::ConversionInfo convert_info; lowering::LowerInfo lower_info; diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index 8da79b13a3..56e484e898 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -184,6 +184,7 @@ void AddInputs( ctx->input_is_dynamic = true; } + // mapping torch Value to tensorrt iTensor ctx->value_tensor_map[in] = trt_in; ctx->num_inputs += 1; } diff --git a/core/ir/StaticParams.cpp b/core/ir/StaticParams.cpp index ac16c72d9f..0fe03e4aff 100644 --- a/core/ir/StaticParams.cpp +++ b/core/ir/StaticParams.cpp @@ -11,7 +11,10 @@ StaticParams get_static_params(c10::ArrayRef inputs, std::ve StaticParams static_params; auto param_it = params.begin(); for (auto in : inputs) { - if (in->type() != c10::TensorType::get() && param_it != params.end()) { + // handle TensorType, TupleType and ListType + if (in->type() != c10::TensorType::get() && + !in->type()->isSubtypeOf(c10::TupleType::create()) && + !in->type()->isSubtypeOf(c10::ListType::ofTensors()) && param_it != params.end()) { static_params[in] = *param_it; ++param_it; } diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index 1c1813ea5f..ce97fa9dbe 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -32,7 +32,9 @@ std::vector get_tensor_inputs( StaticParams& static_params) { std::vector input_tensors; auto inputs = g->inputs(); + LOG_DEBUG("Inputs size " << inputs.size()); for (auto in : inputs) { + LOG_DEBUG("input debug name: " << in->debugName()); // Disregarding inputs that are not tensors or are static // // Ex. @@ -40,6 +42,27 @@ std::vector get_tensor_inputs( // input.1:Tensor -> used if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); + } else if (in->type()->cast() && static_params.find(in) == static_params.end()) { + // } else if (in->type()->isSubtypeOf(c10::TupleType::create()) && static_params.find(in) == static_params.end()) { + at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(in); + LOG_DEBUG("Tuple size " << unpack_tuple.size()); + for (auto item: unpack_tuple) { + input_tensors.push_back(in); + } + } else if (in->type()->isSubtypeOf(c10::ListType::ofTensors()) && static_params.find(in) == static_params.end()) { + + LOG_DEBUG("List use size " << in->uses().size()); + // for (auto use : in->uses()) { + // LOG_DEBUG(use.user->outputs()[0]->debugName()); + // } + // TODO: set the correct list number according to the Input IValue + int n = 2; + auto unpack_node = g->createListUnpack(in, n); + g->block()->appendNode(unpack_node); + for (auto item: unpack_node->outputs()) { + input_tensors.push_back(item); + } + LOG_DEBUG("Unpack List of size " << n); } } return input_tensors; @@ -52,14 +75,17 @@ c10::optional get_value_first_calc_dtype_opt(torch::jit::Block* auto b_ins = b->inputs(); std::unordered_set b_in_set(b_ins.begin(), b_ins.end()); - TORCHTRT_ASSERT( - in->type() == c10::TensorType::get(), "Input is not a tensor, cannot check for dtype based on calculation"); + // TORCHTRT_ASSERT( + // in->type() == c10::TensorType::get(), "Input is not a tensor, cannot check for dtype based on calculation"); auto consumers = in->uses(); auto search_list = std::vector(consumers.begin(), consumers.end()); - - for (auto iter = search_list.begin(); iter != search_list.end(); ++iter) { - auto n = iter->user; + LOG_DEBUG("Users number for " << in->debugName() << ": " << consumers.size()); + while(search_list.size() > 0) { + // after insertion, original iterator will be invalid + 
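+    // Worklist traversal: take the use at the front, erase it from the head
+    // of the vector, and inspect its node; freshly appended uses are still
+    // visited, which avoids the iterator invalidation noted above.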
+    auto& u = search_list.front();
+    search_list.erase(search_list.begin());
+    auto n = u.user;
     LOG_GRAPH("Node we are looking at: " << util::node_info(n));
     auto ins = n->inputs();
     auto outs = n->outputs();
@@ -142,16 +168,28 @@ c10::optional<at::ScalarType> get_value_first_calc_dtype_opt(torch::jit::Block*
 
 TypeMap get_block_first_calc_dtypes_opt(torch::jit::Block* b) {
   TypeMap types;
-
   for (auto i : b->inputs()) {
     if (i->type() == c10::TensorType::get()) {
       torch::jit::Value* in = i;
       types.insert({in, get_value_first_calc_dtype_opt(b, i)});
+    } else if (i->type()->cast<c10::TupleType>()) {
+      // make sure we get the same ptr every time
+      at::ArrayRef<torch::jit::Value*> unpack_tuple = torch::jit::createTupleUnpack(i);
+      LOG_DEBUG("Tuple size " << unpack_tuple.size());
+      for (auto item : unpack_tuple) {
+        torch::jit::Value* in = item;
+        types.insert({in, get_value_first_calc_dtype_opt(b, i)});
+      }
+    } else if (i->type()->isSubtypeOf(c10::ListType::ofTensors())) {
+      LOG_INFO("Unsupported type of c10::ListType::ofTensors()");
+    }
   }
   return types;
 }
 
+static auto core_input_container =
+    torch::class_<Input>("_torch_tensorrt_core_ir", "Input").def(torch::init<>());
+
 } // namespace ir
 } // namespace core
 } // namespace torch_tensorrt
diff --git a/core/ir/ir.h b/core/ir/ir.h
index 2d9acccc69..056f257f22 100644
--- a/core/ir/ir.h
+++ b/core/ir/ir.h
@@ -11,9 +11,10 @@ namespace torch_tensorrt {
 namespace core {
 namespace ir {
 
-struct Input {
+struct Input : torch::CustomClassHolder {
   // Input(std::vector<int64_t> shape);
   // Input(std::vector<int64_t> min_shape, std::vector<int64_t> opt_shape, std::vector<int64_t> max_shape);
+  Input() {};
   Input(
       std::vector<int64_t> shape,
       nvinfer1::DataType dtype = nvinfer1::DataType::kFLOAT,
@@ -36,13 +37,34 @@ struct Input {
   nvinfer1::Dims opt;
   nvinfer1::DataType dtype;
   nvinfer1::TensorFormat format;
+  int id;
 };
 
+// Add to spec
+struct GraphInputs {
+  GraphInputs() {}
+  GraphInputs(torch::jit::IValue inputs) {
+    input_signature = inputs;
+    // TODO flatten IValue
+  }
+  GraphInputs(std::vector<Input> inputs) {
+    flattened_inputs = inputs;
+    // TODO construct the IValue
+  }
+  torch::jit::IValue input_signature; // nested Input, full input spec
+  std::vector<Input> flattened_inputs; // flattened Input
+};
+
+typedef std::pair GraphIO; // Graph input output mapping
+
 using StaticParams = std::map<torch::jit::Value*, torch::jit::IValue>;
 StaticParams get_static_params(c10::ArrayRef<torch::jit::Value*> inputs, std::vector<torch::jit::IValue> params);
 
 using InputSpecMap = std::unordered_map<const torch::jit::Value*, Input>;
 
+std::vector<const torch::jit::Value*> get_tensor_inputs(
+    std::shared_ptr<torch::jit::Graph>& g,
+    StaticParams& static_params);
 InputSpecMap associate_specs_with_inputs(
     std::shared_ptr<torch::jit::Graph>& g,
     std::vector<Input> specs,
diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h
index ace05d33f5..63dc96e654 100644
--- a/cpp/include/torch_tensorrt/torch_tensorrt.h
+++ b/cpp/include/torch_tensorrt/torch_tensorrt.h
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include "torch/custom_class.h" // Just include the .h?
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS
@@ -363,7 +364,7 @@ class TORCHTRT_API TensorFormat {
 * signifying a static input shape or a set of three input shapes representing
 * the min, optimal and max input shapes allowed for the engine.
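 * For example (illustrative): Input({1, 3, 224, 224}) pins a static shape,
 * while Input({1, 3, 224, 224}, {4, 3, 224, 224}, {8, 3, 224, 224}) defines a
 * min/opt/max range.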
 */
-struct TORCHTRT_API Input {
+struct TORCHTRT_API Input : torch::CustomClassHolder {
   /// Minimum acceptable input size into the engine
   std::vector<int64_t> min_shape;
   /// Optimal input size into the engine (size optimized for given kernels accept any size in min max range)
@@ -378,6 +379,7 @@ struct TORCHTRT_API Input {
   /// Expected tensor format for the input
   TensorFormat format;
 
+  Input() {}
   /**
    * @brief Construct a new Input spec object for static input size from
    * vector, optional arguments allow the user to configure expected input shape
@@ -512,6 +514,16 @@ struct TORCHTRT_API Input {
   bool input_is_dynamic;
 };
 
+/**
+ * @brief A struct to hold complex inputs
+ *
+ * This struct can hold either a complex (nested) input spec or a flattened one.
+ */
+struct TORCHTRT_API GraphInputs {
+  torch::jit::IValue input_signature; // nested Input, full input spec
+  std::vector<Input> flattened_inputs; // flattened Input
+};
+
 /**
 * @brief Get the build information for the library including the dependency
 * versions
@@ -581,6 +593,15 @@ struct TORCHTRT_API CompileSpec {
   */
  CompileSpec(std::vector<Input> inputs) : inputs(std::move(inputs)) {}
 
+  /**
+   * @brief Construct a new CompileSpec object from an IValue.
+   * The IValue stores a complex (nested) Input spec
+   *
+   * @param input_signature
+   */
+  CompileSpec(torch::jit::IValue input_signature) {
+    graph_inputs.input_signature = input_signature;
+  }
  // Defaults should reflect TensorRT defaults for BuilderConfig
 
  /**
@@ -591,6 +612,11 @@ struct TORCHTRT_API CompileSpec {
   */
  std::vector<Input> inputs;
 
+  /**
+   * @brief Specifications for inputs to the engine; can store an IValue holding a complex (nested) Input spec
+   * or a flattened Input list
+   */
+  GraphInputs graph_inputs;
  /**
   * @brief The set of precisions TensorRT is allowed to use for kernels during compilation
   *
diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp
index 3058b23ce0..74de9e4801 100644
--- a/cpp/src/compile_spec.cpp
+++ b/cpp/src/compile_spec.cpp
@@ -20,16 +20,82 @@ CompileSpec::CompileSpec(std::vector<std::vector<int64_t>> fixed_sizes) {
   for (auto in : fixed_sizes) {
     inputs.push_back(Input(in));
   }
+  graph_inputs.flattened_inputs = inputs;
 }
 
 CompileSpec::CompileSpec(std::vector<c10::ArrayRef<int64_t>> fixed_sizes) {
   for (auto in : fixed_sizes) {
     inputs.push_back(Input(in));
   }
+  graph_inputs.flattened_inputs = inputs;
+}
+
+void flatten_dfs(std::vector<torchtrt::core::ir::Input>& flattened_inputs, torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) {
+  if (input_ivalue.isTuple()) {
+    auto input_tuple = input_ivalue.toTuple();
+    std::vector<torch::jit::IValue> converted_elements;
+    for (auto item : input_tuple->elements()) {
+      torch::jit::IValue converted_item;
+      flatten_dfs(flattened_inputs, item, converted_item);
+      converted_elements.push_back(converted_item);
+      auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements);
+      converted_ivalue = torch::jit::IValue(tuple_ptr);
+    }
+  } else if (input_ivalue.isList()) {
+    auto input_list = input_ivalue.toList().vec();
+    c10::TypePtr type = input_list[0].type();
+    auto converted_elements = c10::impl::GenericList(type);
+    // std::vector<torch::jit::IValue> converted_elements;
+    for (auto item : input_list) {
+      torch::jit::IValue converted_item;
+      flatten_dfs(flattened_inputs, item, converted_item);
+      converted_elements.push_back(converted_item);
+    }
+    converted_ivalue = torch::jit::IValue(converted_elements);
+  } else if (input_ivalue.isCustomClass()) {
+    torchtrt::core::ir::Input cur_input = to_internal_input(*(input_ivalue.toCustomClass<torchtrt::Input>()));
+    flattened_inputs.push_back(cur_input);
+    converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive<torch_tensorrt::core::ir::Input>(cur_input)));
+  }
+}
+
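+// Illustrative walk-through (hypothetical spec): given an input_signature of
+//   (Input(a), [Input(b), Input(c)])
+// flatten_dfs appends a, b, c to flattened_inputs in visit order and rebuilds
+// the same tuple/list nesting in converted_ivalue with core ir::Input leaves.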
+torch_tensorrt::core::ir::GraphInputs to_internal_graph_inputs(GraphInputs external_graph_input) { + torch_tensorrt::core::ir::GraphInputs internal_graph_input; + + // flattened version + if (external_graph_input.flattened_inputs.size() > 0) { + // std::vector input_shape_list; + auto empty_ivalue = torch::jit::IValue(c10::make_intrusive(torchtrt::core::ir::Input())); + c10::TypePtr type = empty_ivalue.type(); + auto input_shape_list = c10::impl::GenericList(type); + std::vector internal_input = to_vec_internal_inputs(external_graph_input.flattened_inputs); + for (auto input_shape: internal_input) { + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + input_shape_list.push_back(input_shape_ivalue); + } + + torch::jit::IValue input_signature(input_shape_list); + internal_graph_input.flattened_inputs = internal_input; + internal_graph_input.input_signature = input_signature; + + } + // nested version + else { + std::vector flattened_inputs; + torch::jit::IValue input_signature; + flatten_dfs(flattened_inputs, external_graph_input.input_signature, input_signature); + internal_graph_input.flattened_inputs = flattened_inputs; + internal_graph_input.input_signature = input_signature; + printf("in nested version branch\n"); + + } + return internal_graph_input; } torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) { torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.inputs)); + internal.graph_inputs = to_internal_graph_inputs(external.graph_inputs); + internal.inputs = internal.graph_inputs.flattened_inputs; for (auto p : external.enabled_precisions) { internal.convert_info.engine_settings.enabled_precisions.insert(toTRTDataType(p)); diff --git a/cpp/src/torch_tensorrt.cpp b/cpp/src/torch_tensorrt.cpp index 42b44833de..9d6f271332 100644 --- a/cpp/src/torch_tensorrt.cpp +++ b/cpp/src/torch_tensorrt.cpp @@ -30,6 +30,7 @@ torch::jit::script::Module compile(const torch::jit::script::Module& module, Com LOG_DEBUG(get_build_info()); // Want to export a much simpler (non TRT header dependent) API so doing the // type conversion here + printf("in torch_tensorrt::ts::compile\n"); return torch_tensorrt::core::CompileGraph(module, to_internal_compile_spec(info)); } @@ -52,4 +53,7 @@ void set_device(const int gpu_id) { // Want to export a much simpler (non CUDA header dependent) API torch_tensorrt::core::set_device(gpu_id); } + +static auto tensorrt_input_container = + torch::class_("_torch_tensorrt", "Input").def(torch::init<>()); } // namespace torch_tensorrt diff --git a/tests/cpp/BUILD b/tests/cpp/BUILD index 3d69afba95..2d545dc8f1 100644 --- a/tests/cpp/BUILD +++ b/tests/cpp/BUILD @@ -18,7 +18,8 @@ test_suite( ":test_multiple_registered_engines", ":test_serialization", ":test_module_fallback", - ":test_example_tensors" + ":test_example_tensors", + ":test_collection" ], ) @@ -32,7 +33,8 @@ test_suite( ":test_multiple_registered_engines", ":test_serialization", ":test_module_fallback", - ":test_example_tensors" + ":test_example_tensors", + ":test_collection" ], ) @@ -122,6 +124,20 @@ cc_test( }) ) +cc_test( + name = "test_collection", + srcs = ["test_collection.cpp"], + data = [ + "//tests/modules:jit_models", + ], + deps = [ + "//tests/util", + "@googletest//:gtest_main", + ] + select({ + ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"], + "//conditions:default": ["@libtorch//:libtorch"], + }) +) cc_test( name = "test_compiled_modules", srcs = ["test_compiled_modules.cpp"], diff --git 
a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp
new file mode 100644
index 0000000000..a48e642a1c
--- /dev/null
+++ b/tests/cpp/test_collection.cpp
@@ -0,0 +1,91 @@
+#include <string>
+#include <iostream>
+#include "gtest/gtest.h"
+#include "tests/util/util.h"
+#include "torch/script.h"
+#include "torch_tensorrt/torch_tensorrt.h"
+
+
+TEST(CppAPITests, TestCollection) {
+
+
+  std::string path =
+      // "/opt/trtorch/tuple2model.ts";
+      // "/opt/trtorch/tuple2_list2_v3.ts";
+      // "/opt/trtorch/tuple2_tuple2_v3.ts";
+      "/opt/trtorch/tuple2_v3.ts";
+  // "/opt/trtorch/list2_list2_v3.ts";
+  torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kFloat);
+  std::vector<torch::Tensor> inputs;
+  inputs.push_back(in0);
+
+  torch::jit::Module mod;
+  try {
+    // Deserialize the ScriptModule from a file using torch::jit::load().
+    mod = torch::jit::load(path);
+  } catch (const c10::Error& e) {
+    std::cerr << "error loading the model\n";
+  }
+  mod.eval();
+  mod.to(torch::kCUDA);
+
+
+  std::vector<torch::jit::IValue> inputs_;
+
+  for (auto in : inputs) {
+    inputs_.push_back(torch::jit::IValue(in.clone()));
+  }
+
+
+  std::vector<torch::jit::IValue> complex_inputs, complex_inputs_list;
+  std::vector<torch::jit::IValue> tuple;
+  std::tuple<torch::Tensor, torch::Tensor> input_tuple(in0, in0);
+  // auto input_list = c10::impl::GenericList(c10::TensorType::get());
+  // input_list.push_back(inputs_[0]);
+  // input_list.push_back(inputs_[0]);
+
+  // torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list);
+
+  complex_inputs.push_back(input_tuple);
+  complex_inputs_list.push_back(in0);
+  complex_inputs_list.push_back(in0);
+
+
+
+  auto out = mod.forward(complex_inputs);
+  LOG_DEBUG("Finish TorchScript forward");
+
+
+  auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown);
+
+  auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive<torch_tensorrt::Input>(input_shape)));
+
+
+  c10::TypePtr elementType = input_shape_ivalue.type();
+  auto list = c10::impl::GenericList(elementType);
+  list.push_back(input_shape_ivalue);
+  list.push_back(input_shape_ivalue);
+
+  std::tuple<torch::jit::IValue, torch::jit::IValue> input_shape_tuple(input_shape_ivalue, input_shape_ivalue);
+
+
+  torch::jit::IValue complex_input_shape(input_shape_tuple);
+  // torch::jit::IValue complex_input_shape(list);
+
+  auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape);
+  compile_settings.require_full_compilation = false;
+  // compile_settings.torch_executed_modules.push_back("model1");
+  // compile_settings.torch_executed_ops.push_back("aten::sub");
+
+
+  // // FP16 execution
+  // compile_settings.enabled_precisions = {torch::kHalf};
+  // // Compile module
+  auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings);
+  LOG_DEBUG("Finish compile");
+  // auto trt_out = trt_mod.forward(complex_inputs);
+  auto trt_out = trt_mod.forward(complex_inputs_list);
+
+
+  ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5));
+}
\ No newline at end of file
diff --git a/tests/py/test_collection.py b/tests/py/test_collection.py
new file mode 100644
index 0000000000..610bf9fe9c
--- /dev/null
+++ b/tests/py/test_collection.py
@@ -0,0 +1,55 @@
+import torch
+import copy
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Tuple, List, Dict
+
+class Model1(nn.Module):
+    def __init__(self):
+        super(Model1, self).__init__()
+
+    def forward(self, z: Tuple[torch.Tensor, torch.Tensor]):
+        r = z[1] + z[0]
+        return r, z[1]
+
+
+class TestModel1(nn.Module):
+    def __init__(self):
+        super(TestModel1, self).__init__()
+        self.model1 = Model1()
+
+    def forward(self, z: Tuple[torch.Tensor, torch.Tensor]):
+        r2, r1 = self.model1((z[0], z[1]))
+        # unsupported ops
+        i = r2.size(1)
+        j = r2.size(2)
+#         r3 = torch.tensor(i) * torch.tensor(j)
+        r3 = r2[0,0,0,0]
+        k = int(r3) - 5
+
+#         if k > 0:
+        r = r1 - k
+        result = (r, r1)
+#         else:
+#             r = r1 - k
+#             result = (r1, r)
+        return result
+
+class TestModel(nn.Module):
+    def __init__(self):
+        super(TestModel, self).__init__()
+
+    def forward(self, z: Tuple[torch.Tensor, torch.Tensor]):
+        r = z[0] + z[1]
+        return r
+
+test_model = TestModel()
+
+ts = torch.jit.script(test_model)
+print(ts.graph)
+
+ts.to("cuda").eval()
+input_data = torch.randn((16, 3, 32, 32))
+input_data = input_data.float().to("cuda")
+result = ts((input_data, input_data))
+torch.jit.save(ts, "./tuple2_v3.ts")
\ No newline at end of file

From 2fc1363dd0de1a264de97607d4a5209f79261b96 Mon Sep 17 00:00:00 2001
From: inocsin
Date: Thu, 17 Feb 2022 22:46:41 +0800
Subject: [PATCH 02/22] feat: [collection] try to defer determining the data
 type of tuple/list elements. Use a two-level vector to store ir::Input

Signed-off-by: inocsin
---
 core/compiler.cpp             | 107 +++++++++++++++++-------------
 core/compiler.h               |   2 +-
 core/conversion/conversion.h  |   1 +
 core/ir/ir.cpp                | 120 +++++++++++++++++++++++++++-----
 core/ir/ir.h                  |  13 +++-
 core/lowering/lowering.cpp    |   2 +-
 tests/cpp/test_collection.cpp |   4 +-
 7 files changed, 180 insertions(+), 69 deletions(-)

diff --git a/core/compiler.cpp b/core/compiler.cpp
index 45ecd3c993..5b811ca20b 100644
--- a/core/compiler.cpp
+++ b/core/compiler.cpp
@@ -305,60 +305,71 @@ void MapInputsAndDetermineDTypes(
     CompileSpec& cfg,
     std::shared_ptr<torch::jit::Graph>& g,
     ir::StaticParams& static_params,
-    ir::TypeMap& first_use_type_map) {
+    ir::CollectionTypeMap& first_use_type_map) {
+  // ir::TypeMap& first_use_type_map) {
   // Associate input specs with inputs
-  cfg.convert_info.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params));
-  auto tensor_inputs = ir::get_tensor_inputs(g, static_params);
-  LOG_DEBUG("In MapInputsAndDetermineDTypes " << "g->inputs() size " << g->inputs().size() << ", tensor_inputs size " << tensor_inputs.size());
+  // cfg.convert_info.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params));
+  cfg.convert_info.collection_inputs = std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params));
+
+  auto collection_inputs = ir::get_collection_inputs(g, static_params);
+  LOG_DEBUG("In MapInputsAndDetermineDTypes " << "g->inputs() size " << g->inputs().size() << ", collection_inputs size " << collection_inputs.size());
   // for (auto& in : g->inputs()) {
   //   if (static_params.find(in) == static_params.end()) {
-  for (auto in : tensor_inputs) {
-    ir::Input& spec = cfg.convert_info.inputs.find(in)->second;
-    c10::optional<at::ScalarType> est_type_opt = {};
+  for (auto in : collection_inputs) {
+    std::vector<ir::Input>& spec = cfg.convert_info.collection_inputs.find(in)->second;
+    // ir::Input& spec = cfg.convert_info.inputs.find(in)->second;
+    // c10::optional<at::ScalarType> est_type_opt = {};
+    std::vector<c10::optional<at::ScalarType>> est_type_opt;
+
+    auto est_it = first_use_type_map.find(in);
     if (est_it != first_use_type_map.end()) {
       est_type_opt = first_use_type_map.find(in)->second;
    }
-    if (est_type_opt && !spec.dtype_is_user_defined) {
-      // If we can calculate the type from the graph and the type was not defined by the user then use the calculated
-      // type
-      LOG_INFO(
-          "Since input type is not explicitly defined, infering using first tensor calculation\n Found input "
-          << in->debugName() << " has type " << est_type_opt.value()
-          << ". If this is incorrect explicitly set dtype for input and file a bug");
-      spec.dtype = util::ScalarTypeToTRTDataType(est_type_opt.value());
-    } else if (!est_type_opt && !spec.dtype_is_user_defined) {
-      // If we cannot calculate the type and the user did not define the type, then default to FP32
-      LOG_WARNING(
-          "Cannot infer input type from calcuations in graph for input "
-          << in->debugName() << ". Assuming it is Float32. If not, specify input type explicity");
-      spec.dtype = nvinfer1::DataType::kFLOAT;
-    } else if (spec.dtype_is_user_defined && cfg.partition_info.enabled) {
-      if (!est_type_opt) {
-        LOG_INFO("Cannot infer input tensor dtype in graph. Using user provided input dtype settings");
-        first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)};
-      } else {
-        if (util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype) != est_type_opt.value()) {
-          std::stringstream ss;
-          ss << "For input " << in->debugName() << ", found user specified input dtype as ";
-          ss << cfg.convert_info.inputs.find(in)->second.dtype;
-          ss << ", however when inspecting the graph, the input type expected was inferred to be ";
-          ss << est_type_opt.value() << std::endl;
-          ss << "The compiler is going to use the user setting " << cfg.convert_info.inputs.find(in)->second.dtype;
-          ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n";
-          ss << "compatibility with PyTorch's data type convention is required.\n";
-          ss << "If you do indeed see errors at runtime either:\n";
-          ss << "- Remove the dtype spec for " << in->debugName() << std::endl;
-          ss << "- Disable partial compilation by setting require_full_compilation to True";
-          auto warn_str = ss.str();
-          LOG_WARNING(warn_str);
+    // traverse elements in est_type_opt and spec
+    for (int i = 0; i < est_type_opt.size(); i++) {
+      if (est_type_opt[i] && !spec[i].dtype_is_user_defined) {
+        // If we can calculate the type from the graph and the type was not defined by the user then use the calculated
+        // type
+        LOG_INFO(
+            "Since input type is not explicitly defined, inferring using first tensor calculation\n Found input "
+            << in->debugName() << " has type " << est_type_opt[i].value()
+            << ". If this is incorrect explicitly set dtype for input and file a bug");
+        spec[i].dtype = util::ScalarTypeToTRTDataType(est_type_opt[i].value());
+      } else if (!est_type_opt[i] && !spec[i].dtype_is_user_defined) {
+        // If we cannot calculate the type and the user did not define the type, then default to FP32
+        LOG_WARNING(
+            "Cannot infer input type from calculations in graph for input "
+            << in->debugName() << ". Assuming it is Float32.
If not, specify input type explicity"); + spec[i].dtype = nvinfer1::DataType::kFLOAT; + } else if (spec[i].dtype_is_user_defined && cfg.partition_info.enabled) { + if (!est_type_opt[i]) { + LOG_INFO("Cannot infer input tensor dtype in graph, unable to verify user input dtype settings"); + } else { + // if (util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype) != est_type_opt.value()) { + if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_inputs.find(in)->second[i].dtype) != est_type_opt[i].value()) { + std::stringstream ss; + ss << "For input " << in->debugName() << ", found user specified input dtype as "; + ss << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; + // ss << cfg.convert_info.inputs.find(in)->second.dtype; + ss << ", however when inspecting the graph, the input type expected was inferred to be "; + ss << est_type_opt[i].value() << std::endl; + // ss << "The compiler is going to use the user setting " << cfg.convert_info.inputs.find(in)->second.dtype; + ss << "The compiler is going to use the user setting " << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; + ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n"; + ss << "compatibility with PyTorch's data type convention is required.\n"; + ss << "If you do indeed see errors at runtime either:\n"; + ss << "- Remove the dtype spec for " << in->debugName() << std::endl; + ss << "- Disable partial compilation by setting require_full_compilation to True"; + auto warn_str = ss.str(); + LOG_WARNING(warn_str); + // Overwrite type map with user settings + // first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)}; + first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_inputs.find(in)->second[i].dtype)}; + } } - // Overwrite type map with user settings - // We use this map for partitiioning since we need c10::ScalarTypes not nvinfer::DataTypes - first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)}; + } else { + // The user defined the type so no changes are necessary } - } else { - // The user defined the type so no changes are necessary } } // } @@ -383,7 +394,8 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std:: auto params = graph_and_parameters.second; auto static_params = ir::get_static_params(g->inputs(), params); // Infer the type of an input from the weights of the calculation - auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); + // auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); + auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block()); // GPU default WS size : 1 GB // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X. 
@@ -423,7 +435,8 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) auto params = graph_and_parameters.second; auto static_params = ir::get_static_params(g->inputs(), params); // Infer the type of an input from the weights of the calculation - auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); + // auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); + auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block()); MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types); auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true); diff --git a/core/compiler.h b/core/compiler.h index 1743f566b9..16cb17148d 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -22,7 +22,7 @@ struct CompileSpec { inputs = graph_inputs.flattened_inputs; } ir::GraphInputs graph_inputs; - std::vector inputs; + std::vector inputs; // can be replaced by graph_inputs conversion::ConversionInfo convert_info; lowering::LowerInfo lower_info; partitioning::PartitionInfo partition_info; diff --git a/core/conversion/conversion.h b/core/conversion/conversion.h index 58c06b42a3..ba194716e8 100644 --- a/core/conversion/conversion.h +++ b/core/conversion/conversion.h @@ -13,6 +13,7 @@ namespace conversion { struct ConversionInfo { ir::InputSpecMap inputs; + ir::CollectionInputSpecMap collection_inputs; BuilderSettings engine_settings; }; diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index ce97fa9dbe..93d3a16f2d 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -13,6 +13,14 @@ InputSpecMap associate_specs_with_inputs( return pair_input_vals_with_specs(tensor_inputs, specs); } +CollectionInputSpecMap associate_specs_with_collection_inputs( + std::shared_ptr& g, + ir::GraphInputs graph_inputs, + StaticParams& static_params) { + auto tensor_inputs = get_collection_inputs(g, static_params); + return pair_input_vals_with_specs(tensor_inputs, graph_inputs.collection_inputs); +} + InputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector specs) { TORCHTRT_CHECK( vals.size() == specs.size(), @@ -27,6 +35,20 @@ InputSpecMap pair_input_vals_with_specs(std::vector va return a; } +CollectionInputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector>& specs) { + TORCHTRT_CHECK( + vals.size() == specs.size(), + "Expected dimension specifications for all input tensors" + << ", but found " << vals.size() << " input tensors and " << specs.size() << " dimension specs"); + + CollectionInputSpecMap a; + for (size_t i = 0; i < vals.size(); i++) { + LOG_DEBUG("Paring " << i << ": " << vals[i]->debugName() << " : " << specs[i]); + a.insert({vals[i], specs[i]}); + } + return a; +} + std::vector get_tensor_inputs( std::shared_ptr& g, StaticParams& static_params) { @@ -42,27 +64,59 @@ std::vector get_tensor_inputs( // input.1:Tensor -> used if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); - } else if (in->type()->cast() && static_params.find(in) == static_params.end()) { + } + // else if (in->type()->cast() && static_params.find(in) == static_params.end()) { + // // } else if (in->type()->isSubtypeOf(c10::TupleType::create()) && static_params.find(in) == static_params.end()) { + // at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(in); + // LOG_DEBUG("Tuple size " << unpack_tuple.size()); + // for (auto item: unpack_tuple) { + // input_tensors.push_back(in); + // } + // } else if 
(in->type()->isSubtypeOf(c10::ListType::ofTensors()) && static_params.find(in) == static_params.end()) { + + // LOG_DEBUG("List use size " << in->uses().size()); + // // for (auto use : in->uses()) { + // // LOG_DEBUG(use.user->outputs()[0]->debugName()); + // // } + // // TODO: set the correct list number according to the Input IValue + // int n = 2; + // auto unpack_node = g->createListUnpack(in, n); + // g->block()->appendNode(unpack_node); + // for (auto item: unpack_node->outputs()) { + // input_tensors.push_back(item); + // } + // LOG_DEBUG("Unpack List of size " << n); + // } + } + return input_tensors; +} + +std::vector get_collection_inputs( + std::shared_ptr& g, + StaticParams& static_params) { + std::vector input_tensors; + auto inputs = g->inputs(); + LOG_DEBUG("Inputs size " << inputs.size()); + for (auto in : inputs) { + LOG_DEBUG("input debug name: " << in->debugName()); + // Disregarding inputs that are not tensors or are static + // + // Ex. + // self.1:__torch__.alexnet -> ignored + // input.1:Tensor -> used + if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { + input_tensors.push_back(in); + } else if (in->type()->kind() == torch::jit::TypeKind::TupleType && static_params.find(in) == static_params.end()) { // } else if (in->type()->isSubtypeOf(c10::TupleType::create()) && static_params.find(in) == static_params.end()) { + input_tensors.push_back(in); // push original tuple at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(in); LOG_DEBUG("Tuple size " << unpack_tuple.size()); - for (auto item: unpack_tuple) { - input_tensors.push_back(in); - } - } else if (in->type()->isSubtypeOf(c10::ListType::ofTensors()) && static_params.find(in) == static_params.end()) { - - LOG_DEBUG("List use size " << in->uses().size()); - // for (auto use : in->uses()) { - // LOG_DEBUG(use.user->outputs()[0]->debugName()); + // for (auto item: unpack_tuple) { + // input_tensors.push_back(in); // } - // TODO: set the correct list number according to the Input IValue - int n = 2; - auto unpack_node = g->createListUnpack(in, n); - g->block()->appendNode(unpack_node); - for (auto item: unpack_node->outputs()) { - input_tensors.push_back(item); - } - LOG_DEBUG("Unpack List of size " << n); + } else if (in->type()->kind() == torch::jit::TypeKind::ListType && static_params.find(in) == static_params.end()) { + LOG_DEBUG("List use size " << in->uses().size()); + input_tensors.push_back(in); // push original list } } return input_tensors; @@ -187,6 +241,38 @@ TypeMap get_block_first_calc_dtypes_opt(torch::jit::Block* b) { return types; } +CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* b) { + CollectionTypeMap types; + for (auto i : b->inputs()) { + if (i->type() == c10::TensorType::get()) { + torch::jit::Value* in = i; + types.insert({in, {get_value_first_calc_dtype_opt(b, i)}}); + } else if(i->type()->kind() == torch::jit::TypeKind::TupleType) { + LOG_DEBUG("get_block_first_calc_dtypes_opt TupleType"); + + + // TODO: how to evaluate the data type of tuple element + // make sure very time get the same ptr + at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(i); + LOG_DEBUG("get_block_first_calc_dtypes_opt: tuple size " << unpack_tuple.size()); + std::vector> empty_dytpes(unpack_tuple.size()); + types.insert({i, empty_dytpes}); // insert an empty + // for (auto item: unpack_tuple) { + // torch::jit::Value* in = item; + // types.insert({in, get_value_first_calc_dtype_opt(b, i)}); + // } + + } else 
if(i->type()->kind() == torch::jit::TypeKind::ListType) { + // TODO: how to evaluate the data type of tuple element + LOG_DEBUG("get_block_first_calc_dtypes_opt ListType"); + types.insert({i, {}}); // insert an empty + // LOG_INFO("Unsupported type of c10::ListType::ofTensors()"); + + } + } + return types; +} + static auto core_input_container = torch::class_("_torch_tensorrt_core_ir", "Input").def(torch::init<>()); diff --git a/core/ir/ir.h b/core/ir/ir.h index 056f257f22..54e61a44a4 100644 --- a/core/ir/ir.h +++ b/core/ir/ir.h @@ -53,6 +53,7 @@ struct GraphInputs { } torch::jit::IValue input_signature; // nested Input, full input spec std::vector flattened_inputs; // flattend Input + std::vector> collection_inputs; // only support two layer nesting, e.g. ((a, b), [c, d], e) }; typedef std::pair GraphIO; // Graph input output mapping @@ -61,6 +62,7 @@ using StaticParams = std::map; StaticParams get_static_params(c10::ArrayRef inputs, std::vector params); using InputSpecMap = std::unordered_map; +using CollectionInputSpecMap = std::unordered_map>; std::vector get_tensor_inputs( std::shared_ptr& g, @@ -69,16 +71,25 @@ InputSpecMap associate_specs_with_inputs( std::shared_ptr& g, std::vector specs, StaticParams& static_params); +CollectionInputSpecMap associate_specs_with_collection_inputs( + std::shared_ptr& g, + ir::GraphInputs graph_inputs, + StaticParams& static_params); InputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector specs); +CollectionInputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector>& specs); std::vector get_tensor_inputs( std::shared_ptr& g, StaticParams& static_params); +std::vector get_collection_inputs( + std::shared_ptr& g, + StaticParams& static_params); using TypeMap = std::unordered_map>; +using CollectionTypeMap = std::unordered_map>>; c10::optional get_value_first_calc_dtype_opt(torch::jit::Block* b, torch::jit::Value* in); ir::TypeMap get_block_first_calc_dtypes_opt(torch::jit::Block* b); - +ir::CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* b); } // namespace ir } // namespace core } // namespace torch_tensorrt diff --git a/core/lowering/lowering.cpp b/core/lowering/lowering.cpp index d3296c347c..0051ad451c 100644 --- a/core/lowering/lowering.cpp +++ b/core/lowering/lowering.cpp @@ -33,7 +33,7 @@ void LowerGraph(std::shared_ptr& g, LowerInfo lower_info) { torch::jit::InlineFunctionalGraphs(g); torch::jit::PeepholeOptimize(g, false); torch::jit::FuseLinear(g); - torch::jit::LowerAllTuples(g); + // torch::jit::LowerAllTuples(g); if (!lower_info.disable_cse) { torch::jit::EliminateCommonSubexpression(g); } diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp index a48e642a1c..019dd6c7f1 100644 --- a/tests/cpp/test_collection.cpp +++ b/tests/cpp/test_collection.cpp @@ -83,8 +83,8 @@ TEST(CppAPITests, TestCollection) { // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); LOG_DEBUG("Finish compile"); - // auto trt_out = trt_mod.forward(complex_inputs); - auto trt_out = trt_mod.forward(complex_inputs_list); + auto trt_out = trt_mod.forward(complex_inputs); + // auto trt_out = trt_mod.forward(complex_inputs_list); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); From 0072e37e9c2c5e52780383602f022a486f7b03e4 Mon Sep 17 00:00:00 2001 From: inocsin Date: Thu, 10 Mar 2022 17:09:26 +0800 Subject: [PATCH 03/22] feat: [collection] limited support for tuple input Signed-off-by: inocsin --- 
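Notes: nesting is capped at two levels, e.g. ((a, b), [c, d], e); deeper
specs are rejected by flatten_dfs with "3 level of input specs is not
supported". A minimal sketch of the C++ API this enables (illustrative,
mirroring tests/cpp/test_collection.cpp):

    auto in = torch_tensorrt::Input({1, 3, 512, 512});
    auto iv = torch::jit::IValue(c10::make_intrusive<torch_tensorrt::Input>(in));
    std::tuple<torch::jit::IValue, torch::jit::IValue> sig(iv, iv);
    auto spec = torch_tensorrt::ts::CompileSpec(torch::jit::IValue(sig));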
core/compiler.cpp | 6 +- core/compiler.h | 7 +- core/ir/StaticParams.cpp | 4 +- core/ir/ir.cpp | 26 +++-- core/ir/ir.h | 22 ++--- core/partitioning/shape_analysis.cpp | 103 +++++++++++++++++--- core/partitioning/shape_analysis.h | 8 +- cpp/include/torch_tensorrt/torch_tensorrt.h | 6 +- cpp/src/compile_spec.cpp | 101 +++++++++++++------ tests/cpp/test_collection.cpp | 6 +- 10 files changed, 204 insertions(+), 85 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index 5b811ca20b..a431bcdae3 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -449,8 +449,9 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) if (cfg.partition_info.enabled && !(cfg.lower_info.forced_fallback_modules.size() == 0 && cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) { - auto input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.inputs, first_use_types); - auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params); + + auto collection_input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.collection_inputs, first_use_types); + auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), collection_input_ivalues_map, cfg, static_params); new_g = graph_and_mapping.first; LOG_INFO("Segmented Graph: " << *new_g); @@ -464,6 +465,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) TORCHTRT_CHECK( conversion::VerifyConverterSupportForBlock(g->block()), "Not all operations in graph are supported by the compiler"); + // TODO find the right auto engine = conversion::ConvertBlockToEngine(g->block(), cfg.convert_info, static_params); AddEngineToGraph(new_mod, new_g, engine, cuda_device); } diff --git a/core/compiler.h b/core/compiler.h index 16cb17148d..71aa8899b2 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -15,11 +15,12 @@ namespace core { struct CompileSpec { CompileSpec(std::vector inputs) : inputs(inputs) { - graph_inputs = ir::GraphInputs(inputs); + // graph_inputs = ir::GraphInputs(inputs); } CompileSpec(torch::jit::IValue& input_signature) { - graph_inputs = ir::GraphInputs(input_signature); - inputs = graph_inputs.flattened_inputs; + // graph_inputs = ir::GraphInputs(input_signature); + // inputs = graph_inputs.flattened_inputs; + graph_inputs.input_signature = input_signature; } ir::GraphInputs graph_inputs; std::vector inputs; // can be replaced by graph_inputs diff --git a/core/ir/StaticParams.cpp b/core/ir/StaticParams.cpp index 0fe03e4aff..0073ad2888 100644 --- a/core/ir/StaticParams.cpp +++ b/core/ir/StaticParams.cpp @@ -13,8 +13,8 @@ StaticParams get_static_params(c10::ArrayRef inputs, std::ve for (auto in : inputs) { // handle TensorType, TupleType and ListType if (in->type() != c10::TensorType::get() && - !in->type()->isSubtypeOf(c10::TupleType::create()) && - !in->type()->isSubtypeOf(c10::ListType::ofTensors()) && param_it != params.end()) { + in->type()->kind() != torch::jit::TypeKind::TupleType && + in->type()->kind() != torch::jit::TypeKind::ListType && param_it != params.end()) { static_params[in] = *param_it; ++param_it; } diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index 93d3a16f2d..a1a49ba3ca 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -18,10 +18,11 @@ CollectionInputSpecMap associate_specs_with_collection_inputs( ir::GraphInputs graph_inputs, StaticParams& static_params) { auto tensor_inputs = get_collection_inputs(g, static_params); - return 
pair_input_vals_with_specs(tensor_inputs, graph_inputs.collection_inputs); + return pair_input_vals_with_specs_collection(tensor_inputs, graph_inputs.collection_inputs); } InputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector specs) { + LOG_DEBUG("pair_input_vals_with_specs"); TORCHTRT_CHECK( vals.size() == specs.size(), "Expected dimension specifications for all input tensors" @@ -35,7 +36,8 @@ InputSpecMap pair_input_vals_with_specs(std::vector va return a; } -CollectionInputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector>& specs) { +CollectionInputSpecMap pair_input_vals_with_specs_collection(std::vector vals, std::vector>& specs) { + LOG_DEBUG("pair_input_vals_with_specs collection"); TORCHTRT_CHECK( vals.size() == specs.size(), "Expected dimension specifications for all input tensors" @@ -96,7 +98,7 @@ std::vector get_collection_inputs( StaticParams& static_params) { std::vector input_tensors; auto inputs = g->inputs(); - LOG_DEBUG("Inputs size " << inputs.size()); + LOG_DEBUG("get_collection_inputs, inputs size " << inputs.size()); for (auto in : inputs) { LOG_DEBUG("input debug name: " << in->debugName()); // Disregarding inputs that are not tensors or are static @@ -110,12 +112,9 @@ std::vector get_collection_inputs( // } else if (in->type()->isSubtypeOf(c10::TupleType::create()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); // push original tuple at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(in); - LOG_DEBUG("Tuple size " << unpack_tuple.size()); - // for (auto item: unpack_tuple) { - // input_tensors.push_back(in); - // } + LOG_DEBUG("get_collection_inputs, tuple size " << unpack_tuple.size()); } else if (in->type()->kind() == torch::jit::TypeKind::ListType && static_params.find(in) == static_params.end()) { - LOG_DEBUG("List use size " << in->uses().size()); + LOG_DEBUG("get_collection_inputs, list use size " << in->uses().size()); input_tensors.push_back(in); // push original list } } @@ -248,13 +247,11 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* torch::jit::Value* in = i; types.insert({in, {get_value_first_calc_dtype_opt(b, i)}}); } else if(i->type()->kind() == torch::jit::TypeKind::TupleType) { - LOG_DEBUG("get_block_first_calc_dtypes_opt TupleType"); - - - // TODO: how to evaluate the data type of tuple element + LOG_DEBUG("get_block_first_calc_dtypes_opt_collection TupleType"); + // TODO: to evaluate the data type of tuple element // make sure very time get the same ptr at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(i); - LOG_DEBUG("get_block_first_calc_dtypes_opt: tuple size " << unpack_tuple.size()); + LOG_DEBUG("get_block_first_calc_dtypes_opt_collection: tuple size " << unpack_tuple.size()); std::vector> empty_dytpes(unpack_tuple.size()); types.insert({i, empty_dytpes}); // insert an empty // for (auto item: unpack_tuple) { @@ -263,10 +260,9 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* // } } else if(i->type()->kind() == torch::jit::TypeKind::ListType) { - // TODO: how to evaluate the data type of tuple element + // TODO: to decide the size of list and type of list element LOG_DEBUG("get_block_first_calc_dtypes_opt ListType"); types.insert({i, {}}); // insert an empty - // LOG_INFO("Unsupported type of c10::ListType::ofTensors()"); } } diff --git a/core/ir/ir.h b/core/ir/ir.h index 54e61a44a4..06e21fd53b 100644 --- a/core/ir/ir.h +++ b/core/ir/ir.h @@ -42,17 +42,17 @@ struct Input : 
torch::CustomClassHolder { // Add to spec struct GraphInputs { - GraphInputs() {} - GraphInputs(torch::jit::IValue inputs) { - input_signature = inputs; - // TODO flatten IValue - } - GraphInputs(std::vector inputs) { - flattened_inputs = inputs; - // TODO construct the IValue - } +// GraphInputs() {} +// GraphInputs(torch::jit::IValue inputs) { +// input_signature = inputs; +// // TODO flatten IValue +// } + // GraphInputs(std::vector inputs) { + // flattened_inputs = inputs; + // // TODO construct the IValue + // } torch::jit::IValue input_signature; // nested Input, full input spec - std::vector flattened_inputs; // flattend Input + std::vector flattened_inputs; // flattend Input, can be removed std::vector> collection_inputs; // only support two layer nesting, e.g. ((a, b), [c, d], e) }; @@ -76,7 +76,7 @@ CollectionInputSpecMap associate_specs_with_collection_inputs( ir::GraphInputs graph_inputs, StaticParams& static_params); InputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector specs); -CollectionInputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector>& specs); +CollectionInputSpecMap pair_input_vals_with_specs_collection(std::vector vals, std::vector>& specs); std::vector get_tensor_inputs( std::shared_ptr& g, StaticParams& static_params); diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index 96b1312062..0f849c8871 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -8,27 +8,90 @@ namespace torch_tensorrt { namespace core { namespace partitioning { +at::Tensor generateSingleInput(ir::Input& input, c10::optional& type_opt) { + auto cur_shape = input.input_shape; + std::vector shape; + shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); + // auto type_opt = types[input.first][i]; + auto type = at::kFloat; + if (type_opt) { + type = type_opt.value(); + } else { + LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); + } + auto in = at::randint(5, shape, {at::kCUDA}).to(type); + // ivalue_map[input.first] = in.clone(); + return in; +} + std::unordered_map generateRandomInputs( - std::unordered_map& inputs, - std::unordered_map>& types) { +// std::unordered_map> generateRandomInputs( + // std::unordered_map& inputs, + std::unordered_map>& inputs, + // std::unordered_map>& types) { + std::unordered_map>>& types) { // generate random inputs for running pytorch segments std::unordered_map ivalue_map; - - uint64_t in_i = 0; + // std::unordered_map> ivalue_map; + // TODO + // uint64_t in_i = 0; for (auto& input : inputs) { - auto cur_shape = input.second.input_shape; - std::vector shape; - shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); - auto type_opt = types[input.first]; - auto type = at::kFloat; - if (type_opt) { - type = type_opt.value(); + + // for (int i = 0; i < input.second.size(); i++) { + // auto cur_shape = input.second[i].input_shape; + // std::vector shape; + // shape.insert(shape.begin(), std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); + // auto type_opt = types[input.first][i]; + // auto type = at::kFloat; + // if (type_opt) { + // type = type_opt.value(); + // } else { + // LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); + // } + // auto in = at::randint(5, shape, {at::kCUDA}).to(type); + // // ivalue_map[input.first] = in.clone(); + // 
ivalue_map[input.first].push_back(in.clone()); + // // in_i++; + // } + + if (input.first->type()->kind() == torch::jit::TypeKind::ListType) { + // create list + // auto list = c10::impl::GenericList(c10::TensorType::get()); + // list.append(ivalues_maps[input]); + LOG_DEBUG("generateRandomInputs, generate random input of list type"); + // jit_inputs_ivalues.push_back(ivalues_maps[input].toList()); + std::vector list; + c10::TypePtr elementType = c10::TensorType::get(); + auto generic_list = c10::impl::GenericList(elementType); + for (int i = 0; i < input.second.size(); i++) { + auto in = generateSingleInput(input.second[i], types[input.first][i]); + // list.push_back(in.clone()); + generic_list.push_back(in.clone()); + } + // c10::TypePtr elementType = list[0].type(); + + // generic_list.append(list); + ivalue_map[input.first] = generic_list; + // jit_inputs_ivalues.push_back(list); + } else if (input.first->type()->kind() == torch::jit::TypeKind::TupleType) { + // create tuple + // auto tuple = torch::jit::Tuple::create(ivalues_maps[input]); + LOG_DEBUG("generateRandomInputs, generate random input of tuple type"); + std::vector list; + for (int i = 0; i < input.second.size(); i++) { + auto in = generateSingleInput(input.second[i], types[input.first][i]); + list.push_back(in.clone()); + } + auto tuple = c10::ivalue::Tuple::create(list); // create tuple ptr + + ivalue_map[input.first] = c10::IValue(tuple); + // jit_inputs_ivalues.push_back(tuple); } else { - LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); + LOG_DEBUG("generateRandomInputs, generate random input of tensor type"); + auto in = generateSingleInput(input.second[0], types[input.first][0]); + ivalue_map[input.first] = in.clone(); + } - auto in = at::randint(5, shape, {at::kCUDA}).to(type); - ivalue_map[input.first] = in.clone(); - in_i++; } return ivalue_map; } @@ -36,6 +99,7 @@ std::unordered_map generateRandomI void getSegmentsOutputByRunning( SegmentedBlock& seg_block, std::unordered_map& ivalues_maps, + // std::unordered_map>& ivalues_maps, const PartitionInfo& partition_info) { // create a module to run the graph auto g = seg_block.g(); @@ -79,8 +143,16 @@ void getSegmentsOutputByRunning( } else if (input->type()->isSubtypeOf(torch::jit::BoolType::get())) { jit_inputs_ivalues.push_back(ivalues_maps[input].toBool()); } else if (input->type()->kind() == torch::jit::TypeKind::ListType) { + // create list + // auto list = c10::impl::GenericList(c10::TensorType::get()); + // list.append(ivalues_maps[input]); + LOG_DEBUG("getSegmentsOutputByRunning, handle list type"); jit_inputs_ivalues.push_back(ivalues_maps[input].toList()); + // jit_inputs_ivalues.push_back(list); } else if (input->type()->kind() == torch::jit::TypeKind::TupleType) { + // create tuple + // auto tuple = torch::jit::Tuple::create(ivalues_maps[input]); + LOG_DEBUG("getSegmentsOutputByRunning, handle tuple type"); jit_inputs_ivalues.push_back(ivalues_maps[input].toTuple()); } else if (input->type()->kind() == torch::jit::TypeKind::NumberType) { jit_inputs_ivalues.push_back(ivalues_maps[input].toScalar()); @@ -141,6 +213,7 @@ void getSegmentsOutputByRunning( } input_types.push_back(cur_ivalue.toTensor().scalar_type()); } + // TODO: tuple and list inputs in subgraph } seg_block.register_inshapes(input_shapes); diff --git a/core/partitioning/shape_analysis.h b/core/partitioning/shape_analysis.h index 0626490222..46450eb0f8 100644 --- a/core/partitioning/shape_analysis.h +++ b/core/partitioning/shape_analysis.h @@ -6,9 
+6,13 @@ namespace torch_tensorrt { namespace core { namespace partitioning { +// std::unordered_map generateRandomInputs( +// std::unordered_map& input_ranges, +// std::unordered_map>& input_types); + std::unordered_map generateRandomInputs( - std::unordered_map& input_ranges, - std::unordered_map>& input_types); + std::unordered_map>& input_ranges, + std::unordered_map>>& input_types); void runShapeAnalysis( std::vector& segmented_blocks, diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h index 63dc96e654..1ee8dde3c9 100644 --- a/cpp/include/torch_tensorrt/torch_tensorrt.h +++ b/cpp/include/torch_tensorrt/torch_tensorrt.h @@ -521,7 +521,7 @@ struct TORCHTRT_API Input : torch::CustomClassHolder{ */ struct TORCHTRT_API GraphInputs { torch::jit::IValue input_signature; // nested Input, full input spec - std::vector flattened_inputs; // flattend Input + // std::vector flattened_inputs; // flattend Input }; /** @@ -599,9 +599,7 @@ struct TORCHTRT_API CompileSpec { * * @param inputs */ - CompileSpec(torch::jit::IValue input_signature) { - graph_inputs.input_signature = input_signature; - } + CompileSpec(torch::jit::IValue input_signature); // Defaults should reflect TensorRT defaults for BuilderConfig /** diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 74de9e4801..beac217677 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -20,82 +20,123 @@ CompileSpec::CompileSpec(std::vector> fixed_sizes) { for (auto in : fixed_sizes) { inputs.push_back(Input(in)); } - graph_inputs.flattened_inputs = inputs; + // graph_inputs.flattened_inputs = inputs; } CompileSpec::CompileSpec(std::vector> fixed_sizes) { for (auto in : fixed_sizes) { inputs.push_back(Input(in)); } - graph_inputs.flattened_inputs = inputs; + // graph_inputs.flattened_inputs = inputs; } -void flatten_dfs(std::vector& flattened_inputs, torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) { +CompileSpec::CompileSpec(torch::jit::IValue input_signature) { + graph_inputs.input_signature = input_signature; +} + +void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, + torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue, int level, int index) { if (input_ivalue.isTuple()) { auto input_tuple = input_ivalue.toTuple(); std::vector converted_elements; + int idx = 0; + if (level == 0) { + collection_inputs.resize(input_tuple->elements().size()); + } for (auto item: input_tuple->elements()) { torch::jit::IValue converted_item; - flatten_dfs(flattened_inputs, item, converted_item); + int cur_idx = level < 1 ? idx: index; + flatten_dfs(flattened_inputs, collection_inputs, item, converted_item, level+1, cur_idx); converted_elements.push_back(converted_item); auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); converted_ivalue = torch::jit::IValue(tuple_ptr); + idx++; } } else if(input_ivalue.isList()) { auto input_list = input_ivalue.toList().vec(); + if (level == 0) { + collection_inputs.resize(input_list.size()); + } c10::TypePtr type = input_list[0].type(); auto converted_elements = c10::impl::GenericList(type); // std::vector converted_elements; + int idx = 0; for (auto item: input_list) { + int cur_idx = level < 1 ? 
idx: index; torch::jit::IValue converted_item; - flatten_dfs(flattened_inputs, item, converted_item); + flatten_dfs(flattened_inputs, collection_inputs, item, converted_item, level+1, cur_idx); converted_elements.push_back(converted_item); + idx++; } converted_ivalue = torch::jit::IValue(converted_elements); } else if(input_ivalue.isCustomClass()) { torchtrt::core::ir::Input cur_input = to_internal_input(*(input_ivalue.toCustomClass())); flattened_inputs.push_back(cur_input); converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(cur_input))); + if (level == 0) { // a single value like A + collection_inputs.resize(1); + collection_inputs[0].push_back(cur_input); + } else if (level == 1) { // like A in [A, A] or [(B, B), A] + collection_inputs[index].push_back(cur_input); + } else if (level == 2) { // like A in [(A, A), C] + collection_inputs[index].push_back(cur_input); + } else {// only support 2 level + LOG_ERROR("3 level of input specs is not supported"); + } } } + torch_tensorrt::core::ir::GraphInputs to_internal_graph_inputs(GraphInputs external_graph_input) { torch_tensorrt::core::ir::GraphInputs internal_graph_input; - // flattened version - if (external_graph_input.flattened_inputs.size() > 0) { - // std::vector input_shape_list; - auto empty_ivalue = torch::jit::IValue(c10::make_intrusive(torchtrt::core::ir::Input())); - c10::TypePtr type = empty_ivalue.type(); - auto input_shape_list = c10::impl::GenericList(type); - std::vector internal_input = to_vec_internal_inputs(external_graph_input.flattened_inputs); - for (auto input_shape: internal_input) { - auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - input_shape_list.push_back(input_shape_ivalue); - } - - torch::jit::IValue input_signature(input_shape_list); - internal_graph_input.flattened_inputs = internal_input; - internal_graph_input.input_signature = input_signature; + // // flattened version + // if (external_graph_input.flattened_inputs.size() > 0) { + // // std::vector input_shape_list; + // auto empty_ivalue = torch::jit::IValue(c10::make_intrusive(torchtrt::core::ir::Input())); + // c10::TypePtr type = empty_ivalue.type(); + // auto input_shape_list = c10::impl::GenericList(type); + // std::vector internal_input = to_vec_internal_inputs(external_graph_input.flattened_inputs); + // for (auto input_shape: internal_input) { + // auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + // input_shape_list.push_back(input_shape_ivalue); + // } + + // torch::jit::IValue input_signature(input_shape_list); + // internal_graph_input.flattened_inputs = internal_input; + // internal_graph_input.input_signature = input_signature; - } - // nested version - else { + // } + // // nested version + // else { std::vector flattened_inputs; - torch::jit::IValue input_signature; - flatten_dfs(flattened_inputs, external_graph_input.input_signature, input_signature); + std::vector> collection_inputs; + + torch::jit::IValue converted_input_signature; + flatten_dfs(flattened_inputs, collection_inputs, external_graph_input.input_signature, converted_input_signature, 0, 0); internal_graph_input.flattened_inputs = flattened_inputs; - internal_graph_input.input_signature = input_signature; - printf("in nested version branch\n"); + internal_graph_input.input_signature = converted_input_signature; + internal_graph_input.collection_inputs = collection_inputs; + + LOG_DEBUG("compile_spec.cpp, to_internal_graph_inputs, flattened_inputs size " << 
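// --- illustrative sketch (stand-in tensors instead of boxed Input specs) ---
// What the level limit above means in practice: a signature like
// ((A, B), [C, D]) is supported, while a third nesting level is rejected with
// LOG_ERROR. This self-contained example builds such a two-level signature
// with stock IValue APIs; the shapes are placeholders.
#include <torch/script.h>
#include <iostream>
#include <vector>

int main() {
  std::vector<c10::IValue> pair{torch::randn({2, 2}), torch::randn({2, 2})};
  c10::IValue tup = c10::ivalue::Tuple::create(pair);      // (A, B)

  c10::impl::GenericList lst(c10::TensorType::get());
  lst.push_back(torch::randn({2, 2}));                     // [C, D]
  lst.push_back(torch::randn({2, 2}));

  std::vector<c10::IValue> top{tup, c10::IValue(lst)};
  c10::IValue signature = c10::ivalue::Tuple::create(top); // ((A, B), [C, D])

  // Two top-level slots -> collection_inputs would get size 2 here.
  std::cout << signature.toTuple()->elements().size() << std::endl;
  return 0;
}
// --- end sketch ---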
flattened_inputs.size() << ", collection_inputs size "<< collection_inputs.size());
-  }
   return internal_graph_input;
 }

 torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) {
   torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.inputs));
-  internal.graph_inputs = to_internal_graph_inputs(external.graph_inputs);
-  internal.inputs = internal.graph_inputs.flattened_inputs;
+  if (internal.inputs.size() == 0) {
+    LOG_DEBUG("to_internal_compile_spec, Input size == 0, using graph_input");
+    internal.graph_inputs = to_internal_graph_inputs(external.graph_inputs);
+    internal.inputs = internal.graph_inputs.flattened_inputs;
+  } else {
+    LOG_DEBUG("to_internal_compile_spec, Input size != 0, using original Input to construct collection_input");
+    internal.graph_inputs.collection_inputs.resize(internal.inputs.size());
+    for (int i = 0; i < internal.inputs.size(); i++) {
+      internal.graph_inputs.collection_inputs[i].push_back(internal.inputs[i]);
+    }
+  }
+
   for (auto p : external.enabled_precisions) {
     internal.convert_info.engine_settings.enabled_precisions.insert(toTRTDataType(p));

diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp
index 019dd6c7f1..c339b25645 100644
--- a/tests/cpp/test_collection.cpp
+++ b/tests/cpp/test_collection.cpp
@@ -70,10 +70,14 @@ TEST(CppAPITests, TestCollection) {

   torch::jit::IValue complex_input_shape(input_shape_tuple);
+  std::tuple input_tuple2(complex_input_shape);
+  torch::jit::IValue complex_input_shape2(input_tuple2);
   // torch::jit::IValue complex_input_shape(list);

-  auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape);
+  auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2);
   compile_settings.require_full_compilation = false;
+  compile_settings.min_block_size = 1;
+
   // compile_settings.torch_executed_modules.push_back("model1");
   // compile_settings.torch_executed_ops.push_back("aten::sub");

From b1d66cb73364ff92694145979bcbdc65abafc31d Mon Sep 17 00:00:00 2001
From: inocsin
Date: Thu, 10 Mar 2022 19:21:22 +0800
Subject: [PATCH 04/22] fix: [collection] test normal input, fix bug

Signed-off-by: inocsin
---
 core/conversion/conversion.cpp | 19 +++++--
 tests/cpp/test_collection.cpp  | 56 ++++++++++++++++---
 tests/py/test_collection.py    | 91 +++++++++++++++++++++-------------
 3 files changed, 121 insertions(+), 45 deletions(-)

diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp
index 56e484e898..bafde231a1 100644
--- a/core/conversion/conversion.cpp
+++ b/core/conversion/conversion.cpp
@@ -134,7 +134,11 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) {
 void AddInputs(
     ConversionCtx* ctx,
     c10::ArrayRef inputs,
-    std::unordered_map& input_specs) {
+    ConversionInfo& conversion_info) {
+  // std::unordered_map& input_specs) {
+  std::unordered_map& input_specs = conversion_info.inputs;
+  std::unordered_map> collection_input_spec = conversion_info.collection_inputs;
+
   std::vector input_tensors;
   for (auto in : inputs) {
     // Disregarding inputs that are not tensors
@@ -162,9 +166,15 @@
   for (auto input : input_tensors) {
     const torch::jit::Value* in = input;
     TORCHTRT_CHECK(
-        input_specs.find(in) != input_specs.end(),
+        input_specs.find(in) != input_specs.end() || collection_input_spec.find(in) != collection_input_spec.end(),
         "Cannot find an input spec associated with input: " << in->debugName());
-    ir::Input& spec = input_specs.find(in)->second;
+    ir::Input spec;
+    if (input_specs.find(in) != input_specs.end()) {
+      spec =
+          input_specs.find(in)->second;
+    } else {
+      spec = collection_input_spec.find(in)->second[0]; // assume input is tensor
+    }
+    // ir::Input& spec = input_specs.find(in)->second;

     std::string name = std::string("input_") + std::to_string(ctx->num_inputs);
     LOG_INFO(
@@ -405,7 +415,8 @@ void ConvertBlockToNetDef(
   auto inputs = b->inputs();

   AddParamsToCtxValueMap(ctx, static_params);
-  AddInputs(ctx, inputs, build_info.inputs);
+  // AddInputs(ctx, inputs, build_info.inputs);
+  AddInputs(ctx, inputs, build_info);

   auto nodes = b->nodes();

diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp
index c339b25645..b9e92cd732 100644
--- a/tests/cpp/test_collection.cpp
+++ b/tests/cpp/test_collection.cpp
@@ -6,15 +6,10 @@

 #include "torch_tensorrt/torch_tensorrt.h"

-TEST(CppAPITests, TestCollection) {
-
+TEST(CppAPITests, TestCollectionTupleInput) {
   std::string path =
-      // "/opt/trtorch/tuple2model.ts";
-      // "/opt/trtorch/tuple2_list2_v3.ts";
-      // "/opt/trtorch/tuple2_tuple2_v3.ts";
-      "/opt/trtorch/tuple2_v3.ts";
-      // "/opt/trtorch/list2_list2_v3.ts";
+      "/root/Torch-TensorRT/tuple_input.ts";
   torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kFloat);
   std::vector inputs;
   inputs.push_back(in0);

   torch::jit::Module mod;
   try {
     // Deserialize the ScriptModule from a file using torch::jit::load().
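// --- illustrative sketch (stand-in types; real keys are torch::jit::Value*) ---
// The AddInputs hunk above consults the per-tensor spec map first and only
// then falls back to the collection map, taking element [0] on the assumption
// that the entry describes a single tensor. The same lookup order, with
// simplified types:
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

struct Spec { std::string dtype; }; // stand-in for ir::Input

static Spec find_spec(const void* value,
                      const std::unordered_map<const void*, Spec>& input_specs,
                      const std::unordered_map<const void*, std::vector<Spec>>& collection_specs) {
  auto it = input_specs.find(value);
  if (it != input_specs.end()) {
    return it->second;                 // flat spec wins when present
  }
  auto cit = collection_specs.find(value);
  if (cit != collection_specs.end() && !cit->second.empty()) {
    return cit->second[0];             // fallback: first spec of the slot
  }
  throw std::runtime_error("no input spec associated with this value");
}
// --- end sketch ---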
+ mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + auto out = mod.forward(inputs_); + LOG_DEBUG("Finish torchscirpt forward"); + + std::vector input_range; + input_range.push_back({in0.sizes(), torch::kF32}); + input_range.push_back({in0.sizes(), torch::kF32}); + torch_tensorrt::ts::CompileSpec compile_settings(input_range); + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; + + // // FP16 execution + // compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(inputs_); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); } \ No newline at end of file diff --git a/tests/py/test_collection.py b/tests/py/test_collection.py index 610bf9fe9c..03aa4aeb41 100644 --- a/tests/py/test_collection.py +++ b/tests/py/test_collection.py @@ -4,52 +4,75 @@ import torch.nn.functional as F from typing import Tuple, List, Dict -class Model1(nn.Module): - def __init__(self): - super(Model1, self).__init__() +# class Model1(nn.Module): +# def __init__(self): +# super(Model1, self).__init__() - def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): - r = z[1] + z[0] - return r, z[1] +# def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): +# r = z[1] + z[0] +# return r, z[1] + + +# class TestModel1(nn.Module): +# def __init__(self): +# super(TestModel, self).__init__() +# self.model1 = Model1() +# def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): +# r2, r1 = self.model1((z[0], z[1])) +# # unsupport ops +# i = r2.size(1) +# j = r2.size(2) +# # r3 = torch.tensor(i) * torch.tensor(j) +# r3 = r2[0,0,0,0] +# k = int(r3) - 5 -class TestModel1(nn.Module): +# # if k > 0: +# r = r1 - k +# result = (r, r1) +# # else: +# # r = r1 - k +# # result = (r1, r) +# return result + +class Normal(nn.Module): def __init__(self): - super(TestModel, self).__init__() - self.model1 = Model1() + super(Normal, self).__init__() - def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): - r2, r1 = self.model1((z[0], z[1])) - # unsupport ops - i = r2.size(1) - j = r2.size(2) -# r3 = torch.tensor(i) * torch.tensor(j) - r3 = r2[0,0,0,0] - k = int(r3) - 5 - -# if k > 0: - r = r1 - k - result = (r, r1) -# else: -# r = r1 - k -# result = (r1, r) - return result - -class TestModel(nn.Module): + def forward(self, x, y): + r = x + y + return r + +class TupleInput(nn.Module): def __init__(self): - super(TestModel, self).__init__() + super(TupleInput, self).__init__() def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): r = z[0] + z[1] return r -test_model = TestModel() +class ListInput(nn.Module): + def __init__(self): + super(ListInput, self).__init__() + + def forward(self, z: List[torch.Tensor]): + r = z[0] + z[1] + return r -ts = torch.jit.script(test_model) -print(ts.graph) -ts.to("cuda").eval() input_data = torch.randn((16, 3, 32, 32)) input_data = input_data.float().to("cuda") -result = ts((input_data, input_data)) -torch.jit.save(ts, "./tuple2_v3.ts") \ No newline at end of file + +normal_model = Normal() +normal_model_ts = torch.jit.script(normal_model) +print(normal_model_ts.graph) +result = normal_model_ts(input_data, input_data) +normal_model_ts.to("cuda").eval() 
+torch.jit.save(normal_model_ts, "./normal_model.ts") + +tuple_input = TupleInput() +tuple_input_ts = torch.jit.script(tuple_input) +print(tuple_input_ts.graph) +result = tuple_input_ts((input_data, input_data)) +tuple_input_ts.to("cuda").eval() +torch.jit.save(tuple_input_ts, "./tuple_input.ts") \ No newline at end of file From d4e54f12b659f47423f130cba5d6e15bad1bca99 Mon Sep 17 00:00:00 2001 From: inocsin Date: Thu, 10 Mar 2022 21:18:02 +0800 Subject: [PATCH 05/22] feat: [collection] support list input type Signed-off-by: inocsin --- core/partitioning/shape_analysis.cpp | 13 +++- tests/cpp/test_collection.cpp | 91 ++++++++++++++++++++++++---- tests/py/test_collection.py | 9 ++- 3 files changed, 98 insertions(+), 15 deletions(-) diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index 0f849c8871..a6459ebc6f 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -63,16 +63,23 @@ std::unordered_map generateRandomI std::vector list; c10::TypePtr elementType = c10::TensorType::get(); auto generic_list = c10::impl::GenericList(elementType); + LOG_DEBUG("generateRandomInputs, 0"); for (int i = 0; i < input.second.size(); i++) { - auto in = generateSingleInput(input.second[i], types[input.first][i]); + // types for list is {} + // auto in = generateSingleInput(input.second[i], types[input.first][i]); + // TODO: need to decide the input type of list elements in ir.cpp + c10::optional type_opt = {}; + auto in = generateSingleInput(input.second[i], type_opt); // list.push_back(in.clone()); generic_list.push_back(in.clone()); + LOG_DEBUG("generateRandomInputs, 1"); } // c10::TypePtr elementType = list[0].type(); - + LOG_DEBUG("generateRandomInputs, 2"); // generic_list.append(list); - ivalue_map[input.first] = generic_list; + ivalue_map[input.first] = c10::IValue(generic_list); // jit_inputs_ivalues.push_back(list); + LOG_DEBUG("generateRandomInputs, finish generate random input of list type"); } else if (input.first->type()->kind() == torch::jit::TypeKind::TupleType) { // create tuple // auto tuple = torch::jit::Tuple::create(ivalues_maps[input]); diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp index b9e92cd732..7d3b54152b 100644 --- a/tests/cpp/test_collection.cpp +++ b/tests/cpp/test_collection.cpp @@ -33,7 +33,7 @@ TEST(CppAPITests, TestCollectionTupleInput) { std::vector complex_inputs, complex_inputs_list; - std::vector tuple; + // std::vector tuple; std::tuple input_tuple(in0, in0); // auto input_list = c10::impl::GenericList(c10::TensorType::get()); // input_list.push_back(inputs_[0]); @@ -42,8 +42,8 @@ TEST(CppAPITests, TestCollectionTupleInput) { // torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); complex_inputs.push_back(input_tuple); - complex_inputs_list.push_back(in0); - complex_inputs_list.push_back(in0); + // complex_inputs_list.push_back(in0); + // complex_inputs_list.push_back(in0); @@ -56,10 +56,10 @@ TEST(CppAPITests, TestCollectionTupleInput) { auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - c10::TypePtr elementType = input_shape_ivalue.type(); - auto list = c10::impl::GenericList(elementType); - list.push_back(input_shape_ivalue); - list.push_back(input_shape_ivalue); + // c10::TypePtr elementType = input_shape_ivalue.type(); + // auto list = c10::impl::GenericList(elementType); + // list.push_back(input_shape_ivalue); + // list.push_back(input_shape_ivalue); std::tuple input_shape_tuple(input_shape_ivalue, 
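// --- illustrative sketch (placeholder shape/dtype; public header assumed) ---
// What the tuple test below is about to do, condensed: box each
// torch_tensorrt::Input (a torch::CustomClassHolder) as an IValue, then group
// the boxes in a std::tuple shaped like the model's forward((x, y)).
#include "torch_tensorrt/torch_tensorrt.h"

static torch::jit::IValue make_tuple_signature(std::vector<int64_t> shape) {
  auto spec = torch_tensorrt::Input(shape, torch_tensorrt::DataType::kHalf);
  auto spec_iv = torch::jit::IValue(c10::make_intrusive<torch_tensorrt::Input>(spec));
  std::tuple<torch::jit::IValue, torch::jit::IValue> sig(spec_iv, spec_iv);
  return torch::jit::IValue(sig); // one boxed Input per tuple slot
}
// --- end sketch ---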
input_shape_ivalue); @@ -73,10 +73,6 @@ TEST(CppAPITests, TestCollectionTupleInput) { compile_settings.require_full_compilation = false; compile_settings.min_block_size = 1; - // compile_settings.torch_executed_modules.push_back("model1"); - // compile_settings.torch_executed_ops.push_back("aten::sub"); - - // // FP16 execution // compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module @@ -133,5 +129,78 @@ TEST(CppAPITests, TestCollectionNormalInput) { LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(inputs_); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); +} + + + +TEST(CppAPITests, TestCollectionListInput) { + + std::string path = + "/root/Torch-TensorRT/list_input.ts"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kFloat); + std::vector inputs; + inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + std::vector complex_inputs; + auto input_list = c10::impl::GenericList(c10::TensorType::get()); + input_list.push_back(inputs_[0]); + input_list.push_back(inputs_[0]); + + torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); + + complex_inputs.push_back(input_list_ivalue); + + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + + c10::TypePtr elementType = input_shape_ivalue.type(); + auto list = c10::impl::GenericList(elementType); + list.push_back(input_shape_ivalue); + list.push_back(input_shape_ivalue); + + + torch::jit::IValue complex_input_shape(list); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 1; + compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + + // // FP16 execution + // compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + // auto trt_out = trt_mod.forward(complex_inputs_list); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); } \ No newline at end of file diff --git a/tests/py/test_collection.py b/tests/py/test_collection.py index 03aa4aeb41..91eca4c854 100644 --- a/tests/py/test_collection.py +++ b/tests/py/test_collection.py @@ -75,4 +75,11 @@ def forward(self, z: List[torch.Tensor]): print(tuple_input_ts.graph) result = tuple_input_ts((input_data, input_data)) tuple_input_ts.to("cuda").eval() -torch.jit.save(tuple_input_ts, "./tuple_input.ts") \ No newline at end of file +torch.jit.save(tuple_input_ts, "./tuple_input.ts") + +list_input = ListInput() +list_input_ts = torch.jit.script(list_input) +print(list_input_ts.graph) +result = list_input_ts([input_data, input_data]) +list_input_ts.to("cuda").eval() 
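// --- illustrative sketch (placeholder shape/dtype and element count) ---
// Counterpart of the new list-input test: the boxed Input IValues go into a
// GenericList whose element type is taken from the first boxed value, while
// aten::__getitem__ is left to Torch via torch_executed_ops.
#include "torch_tensorrt/torch_tensorrt.h"

static torch::jit::IValue make_list_signature(std::vector<int64_t> shape, size_t n) {
  auto spec = torch_tensorrt::Input(shape, torch_tensorrt::DataType::kHalf);
  auto spec_iv = torch::jit::IValue(c10::make_intrusive<torch_tensorrt::Input>(spec));
  c10::impl::GenericList list(spec_iv.type()); // element type = boxed Input class
  for (size_t i = 0; i < n; i++) {
    list.push_back(spec_iv);
  }
  return torch::jit::IValue(list);
}
// --- end sketch ---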
+torch.jit.save(list_input_ts, "./list_input.ts") \ No newline at end of file From a9aa2e74cf041ec35aa022f1af795d832c193176 Mon Sep 17 00:00:00 2001 From: inocsin Date: Wed, 16 Mar 2022 18:08:26 +0800 Subject: [PATCH 06/22] feat: [collection] support user defined input data type Signed-off-by: inocsin --- core/compiler.cpp | 16 ++++++++-- core/ir/ir.cpp | 14 +++++--- core/ir/ir.h | 2 +- core/partitioning/shape_analysis.cpp | 5 +-- cpp/src/compile_spec.cpp | 20 +----------- cpp/src/torch_tensorrt.cpp | 1 - tests/cpp/test_collection.cpp | 48 +++++++++++++--------------- 7 files changed, 51 insertions(+), 55 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index a431bcdae3..1c8ed34762 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -343,10 +343,22 @@ void MapInputsAndDetermineDTypes( spec[i].dtype = nvinfer1::DataType::kFLOAT; } else if (spec[i].dtype_is_user_defined && cfg.partition_info.enabled) { if (!est_type_opt[i]) { - LOG_INFO("Cannot infer input tensor dtype in graph, unable to verify user input dtype settings"); + LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting"); + // TODO set input data type + + std::stringstream ss; + ss << "For input " << in->debugName() << ", found user specified input dtype as "; + ss << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; + // ss << cfg.convert_info.inputs.find(in)->second.dtype; + ss << ". The compiler is going to use the user setting " << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; + auto warn_str = ss.str(); + LOG_WARNING(warn_str); + // Overwrite type map with user settings + first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_inputs.find(in)->second[i].dtype)}; + } else { // if (util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype) != est_type_opt.value()) { - if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_inputs.find(in)->second[i].dtype) != est_type_opt[i].value()) { + if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_inputs.find(in)->second[i].dtype) != est_type_opt[i].value()) { std::stringstream ss; ss << "For input " << in->debugName() << ", found user specified input dtype as "; ss << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index a1a49ba3ca..52bd92a17f 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -250,10 +250,13 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* LOG_DEBUG("get_block_first_calc_dtypes_opt_collection TupleType"); // TODO: to evaluate the data type of tuple element // make sure very time get the same ptr + c10::optional tp = get_value_first_calc_dtype_opt(b, i); at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(i); LOG_DEBUG("get_block_first_calc_dtypes_opt_collection: tuple size " << unpack_tuple.size()); - std::vector> empty_dytpes(unpack_tuple.size()); - types.insert({i, empty_dytpes}); // insert an empty + // Assume all tuple has the same datatype + // std::vector> dytpes(unpack_tuple.size(), tp); + std::vector> dytpes(unpack_tuple.size()); + types.insert({i, dytpes}); // insert an empty // for (auto item: unpack_tuple) { // torch::jit::Value* in = item; // types.insert({in, get_value_first_calc_dtype_opt(b, i)}); @@ -261,8 +264,11 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* } else if(i->type()->kind() == torch::jit::TypeKind::ListType) { // TODO: to decide the size of list and type 
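// --- illustrative sketch (simplified stand-in types) ---
// Condensed restatement of the rule the compiler.cpp hunk above implements for
// each spec inside a collection slot: a user-defined dtype always wins and is
// written back into the first-use type map; otherwise the dtype inferred from
// the graph's first calculation is adopted, keeping the float default when
// neither is available.
#include <c10/core/ScalarType.h>
#include <c10/util/Optional.h>
#include <vector>

struct SpecD {
  c10::ScalarType dtype = c10::ScalarType::Float;
  bool dtype_is_user_defined = false;
};

static void resolve_dtypes(std::vector<SpecD>& specs,
                           std::vector<c10::optional<c10::ScalarType>>& first_use) {
  for (size_t i = 0; i < specs.size(); i++) {
    if (specs[i].dtype_is_user_defined) {
      first_use[i] = specs[i].dtype;    // user setting overrides the estimate
    } else if (first_use[i]) {
      specs[i].dtype = *first_use[i];   // fall back to the calculated type
    }                                   // else: keep the float default
  }
}
// --- end sketch ---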
of list element - LOG_DEBUG("get_block_first_calc_dtypes_opt ListType"); - types.insert({i, {}}); // insert an empty + LOG_DEBUG("get_block_first_calc_dtypes_opt ListType: use size " << i->uses().size()); + c10::optional tp = get_value_first_calc_dtype_opt(b, i); + // std::vector> dytpes(i->uses().size()); + std::vector> dytpes(i->uses().size(), tp); + types.insert({i, dytpes}); // insert an empty } } diff --git a/core/ir/ir.h b/core/ir/ir.h index 06e21fd53b..69c70263ed 100644 --- a/core/ir/ir.h +++ b/core/ir/ir.h @@ -52,7 +52,7 @@ struct GraphInputs { // // TODO construct the IValue // } torch::jit::IValue input_signature; // nested Input, full input spec - std::vector flattened_inputs; // flattend Input, can be removed + std::vector flattened_inputs; // flattend Input std::vector> collection_inputs; // only support two layer nesting, e.g. ((a, b), [c, d], e) }; diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index a6459ebc6f..6d69275e3a 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -68,8 +68,9 @@ std::unordered_map generateRandomI // types for list is {} // auto in = generateSingleInput(input.second[i], types[input.first][i]); // TODO: need to decide the input type of list elements in ir.cpp - c10::optional type_opt = {}; - auto in = generateSingleInput(input.second[i], type_opt); + // c10::optional type_opt = {}; + // auto in = generateSingleInput(input.second[i], type_opt); + auto in = generateSingleInput(input.second[i], types[input.first][i]); // list.push_back(in.clone()); generic_list.push_back(in.clone()); LOG_DEBUG("generateRandomInputs, 1"); diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index beac217677..5e015e3a6e 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -90,25 +90,6 @@ void flatten_dfs(std::vector& flattened_inputs, std:: torch_tensorrt::core::ir::GraphInputs to_internal_graph_inputs(GraphInputs external_graph_input) { torch_tensorrt::core::ir::GraphInputs internal_graph_input; - // // flattened version - // if (external_graph_input.flattened_inputs.size() > 0) { - // // std::vector input_shape_list; - // auto empty_ivalue = torch::jit::IValue(c10::make_intrusive(torchtrt::core::ir::Input())); - // c10::TypePtr type = empty_ivalue.type(); - // auto input_shape_list = c10::impl::GenericList(type); - // std::vector internal_input = to_vec_internal_inputs(external_graph_input.flattened_inputs); - // for (auto input_shape: internal_input) { - // auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - // input_shape_list.push_back(input_shape_ivalue); - // } - - // torch::jit::IValue input_signature(input_shape_list); - // internal_graph_input.flattened_inputs = internal_input; - // internal_graph_input.input_signature = input_signature; - - // } - // // nested version - // else { std::vector flattened_inputs; std::vector> collection_inputs; @@ -134,6 +115,7 @@ torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) { internal.graph_inputs.collection_inputs.resize(internal.inputs.size()); for (int i = 0; i < internal.inputs.size(); i++) { internal.graph_inputs.collection_inputs[i].push_back(internal.inputs[i]); + internal.graph_inputs.flattened_inputs = internal.inputs; } } diff --git a/cpp/src/torch_tensorrt.cpp b/cpp/src/torch_tensorrt.cpp index 9d6f271332..93813190ab 100644 --- a/cpp/src/torch_tensorrt.cpp +++ b/cpp/src/torch_tensorrt.cpp @@ -30,7 +30,6 @@ torch::jit::script::Module 
compile(const torch::jit::script::Module& module, Com LOG_DEBUG(get_build_info()); // Want to export a much simpler (non TRT header dependent) API so doing the // type conversion here - printf("in torch_tensorrt::ts::compile\n"); return torch_tensorrt::core::CompileGraph(module, to_internal_compile_spec(info)); } diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp index 7d3b54152b..6ee0a78871 100644 --- a/tests/cpp/test_collection.cpp +++ b/tests/cpp/test_collection.cpp @@ -10,9 +10,10 @@ TEST(CppAPITests, TestCollectionTupleInput) { std::string path = "/root/Torch-TensorRT/tuple_input.ts"; - torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kFloat); - std::vector inputs; - inputs.push_back(in0); + // torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kFloat); + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + // std::vector inputs; + // inputs.push_back(in0); torch::jit::Module mod; try { @@ -23,13 +24,13 @@ TEST(CppAPITests, TestCollectionTupleInput) { } mod.eval(); mod.to(torch::kCUDA); - - std::vector inputs_; - for (auto in : inputs) { - inputs_.push_back(torch::jit::IValue(in.clone())); - } + // std::vector inputs_; + + // for (auto in : inputs) { + // inputs_.push_back(torch::jit::IValue(in.clone())); + // } std::vector complex_inputs, complex_inputs_list; @@ -42,16 +43,12 @@ TEST(CppAPITests, TestCollectionTupleInput) { // torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); complex_inputs.push_back(input_tuple); - // complex_inputs_list.push_back(in0); - // complex_inputs_list.push_back(in0); - - auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); - - auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); + // auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); @@ -63,7 +60,6 @@ TEST(CppAPITests, TestCollectionTupleInput) { std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); - torch::jit::IValue complex_input_shape(input_shape_tuple); std::tuple input_tuple2(complex_input_shape); torch::jit::IValue complex_input_shape2(input_tuple2); @@ -74,13 +70,12 @@ TEST(CppAPITests, TestCollectionTupleInput) { compile_settings.min_block_size = 1; // // FP16 execution - // compile_settings.enabled_precisions = {torch::kHalf}; + compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - // auto trt_out = trt_mod.forward(complex_inputs_list); - + // std::cout << out.toTensor() << std::endl; ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); } @@ -90,7 +85,7 @@ TEST(CppAPITests, TestCollectionNormalInput) { std::string path = "/root/Torch-TensorRT/normal_model.ts"; - torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kFloat); + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); inputs.push_back(in0); @@ -116,14 +111,14 @@ TEST(CppAPITests, TestCollectionNormalInput) { LOG_DEBUG("Finish torchscirpt forward"); std::vector input_range; - 
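// --- illustrative sketch (placeholder shape; error handling omitted) ---
// The FP16 configuration these tests switch to, in one helper: half-precision
// input specs (DataType::kHalf) plus torch::kHalf in enabled_precisions, so
// TensorRT may select FP16 kernels.
#include "torch_tensorrt/torch_tensorrt.h"

static torch::jit::Module compile_fp16(torch::jit::Module& mod, std::vector<int64_t> shape) {
  std::vector<torch_tensorrt::Input> specs;
  specs.emplace_back(shape, torch_tensorrt::DataType::kHalf);
  auto settings = torch_tensorrt::ts::CompileSpec(specs);
  settings.enabled_precisions = {torch::kHalf}; // allow FP16 kernel selection
  settings.min_block_size = 1;
  return torch_tensorrt::torchscript::compile(mod, settings);
}
// --- end sketch ---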
input_range.push_back({in0.sizes(), torch::kF32}); - input_range.push_back({in0.sizes(), torch::kF32}); + input_range.push_back({in0.sizes(), torch::kF16}); + input_range.push_back({in0.sizes(), torch::kF16}); torch_tensorrt::ts::CompileSpec compile_settings(input_range); compile_settings.require_full_compilation = true; compile_settings.min_block_size = 1; // // FP16 execution - // compile_settings.enabled_precisions = {torch::kHalf}; + compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); LOG_DEBUG("Finish compile"); @@ -138,7 +133,7 @@ TEST(CppAPITests, TestCollectionListInput) { std::string path = "/root/Torch-TensorRT/list_input.ts"; - torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kFloat); + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -173,7 +168,8 @@ TEST(CppAPITests, TestCollectionListInput) { LOG_DEBUG("Finish torchscirpt forward"); - auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); + // auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); @@ -194,13 +190,13 @@ TEST(CppAPITests, TestCollectionListInput) { compile_settings.torch_executed_ops.push_back("aten::__getitem__"); // // FP16 execution - // compile_settings.enabled_precisions = {torch::kHalf}; + compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); // auto trt_out = trt_mod.forward(complex_inputs_list); - + // std::cout << out.toTensor() << std::endl; ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); } \ No newline at end of file From 5830cbe99ea488a1f082f88b14b8d9873003c4a8 Mon Sep 17 00:00:00 2001 From: inocsin Date: Thu, 17 Mar 2022 16:46:30 +0800 Subject: [PATCH 07/22] feat: [collection] support output type of list and tuple Signed-off-by: inocsin --- tests/cpp/test_collection.cpp | 180 +++++++++++++++++++++++++++++++--- tests/py/test_collection.py | 35 ++++++- 2 files changed, 201 insertions(+), 14 deletions(-) diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp index 6ee0a78871..0533b7ae77 100644 --- a/tests/cpp/test_collection.cpp +++ b/tests/cpp/test_collection.cpp @@ -6,6 +6,52 @@ #include "torch_tensorrt/torch_tensorrt.h" +TEST(CppAPITests, TestCollectionNormalInput) { + + std::string path = + "/root/Torch-TensorRT/normal_model.ts"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + std::vector inputs; + inputs.push_back(in0); + inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). 
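// --- illustrative sketch (placeholder path) ---
// Every test in this file follows the same harness recipe sketched here:
// deserialize a scripted module, move it to CUDA in eval mode, and run
// forward on cloned tensors to produce the reference output.
#include <torch/script.h>
#include <string>
#include <vector>

static c10::IValue run_reference(const std::string& path, const std::vector<at::Tensor>& tensors) {
  torch::jit::Module mod = torch::jit::load(path); // throws c10::Error on failure
  mod.eval();
  mod.to(torch::kCUDA);
  std::vector<torch::jit::IValue> inputs;
  for (const auto& t : tensors) {
    inputs.emplace_back(t.clone());
  }
  return mod.forward(inputs);
}
// --- end sketch ---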
+ mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + auto out = mod.forward(inputs_); + LOG_DEBUG("Finish torchscirpt forward"); + + std::vector input_range; + input_range.push_back({in0.sizes(), torch::kF16}); + input_range.push_back({in0.sizes(), torch::kF16}); + torch_tensorrt::ts::CompileSpec compile_settings(input_range); + compile_settings.require_full_compilation = true; + compile_settings.min_block_size = 1; + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(inputs_); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); +} + TEST(CppAPITests, TestCollectionTupleInput) { std::string path = @@ -81,14 +127,13 @@ TEST(CppAPITests, TestCollectionTupleInput) { } -TEST(CppAPITests, TestCollectionNormalInput) { +TEST(CppAPITests, TestCollectionListInput) { std::string path = - "/root/Torch-TensorRT/normal_model.ts"; + "/root/Torch-TensorRT/list_input.ts"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); - inputs.push_back(in0); torch::jit::Module mod; try { @@ -107,32 +152,136 @@ TEST(CppAPITests, TestCollectionNormalInput) { inputs_.push_back(torch::jit::IValue(in.clone())); } - auto out = mod.forward(inputs_); + std::vector complex_inputs; + auto input_list = c10::impl::GenericList(c10::TensorType::get()); + input_list.push_back(inputs_[0]); + input_list.push_back(inputs_[0]); + + torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); + + complex_inputs.push_back(input_list_ivalue); + + + auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); - std::vector input_range; - input_range.push_back({in0.sizes(), torch::kF16}); - input_range.push_back({in0.sizes(), torch::kF16}); - torch_tensorrt::ts::CompileSpec compile_settings(input_range); - compile_settings.require_full_compilation = true; + + // auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + + c10::TypePtr elementType = input_shape_ivalue.type(); + auto list = c10::impl::GenericList(elementType); + list.push_back(input_shape_ivalue); + list.push_back(input_shape_ivalue); + + + torch::jit::IValue complex_input_shape(list); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; compile_settings.min_block_size = 1; + compile_settings.torch_executed_ops.push_back("aten::__getitem__"); // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; // // Compile module auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); LOG_DEBUG("Finish compile"); - auto trt_out = trt_mod.forward(inputs_); + auto trt_out = trt_mod.forward(complex_inputs); + // auto trt_out = trt_mod.forward(complex_inputs_list); + // std::cout << out.toTensor() << 
std::endl; ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); } +TEST(CppAPITests, TestCollectionTupleInputOutput) { -TEST(CppAPITests, TestCollectionListInput) { + std::string path = + "/root/Torch-TensorRT/tuple_input_output.ts"; + // torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kFloat); + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); + // std::vector inputs; + // inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + // std::vector inputs_; + + // for (auto in : inputs) { + // inputs_.push_back(torch::jit::IValue(in.clone())); + // } + + + std::vector complex_inputs, complex_inputs_list; + // std::vector tuple; + std::tuple input_tuple(in0, in0); + // auto input_list = c10::impl::GenericList(c10::TensorType::get()); + // input_list.push_back(inputs_[0]); + // input_list.push_back(inputs_[0]); + + // torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); + + complex_inputs.push_back(input_tuple); + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + // auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + + // c10::TypePtr elementType = input_shape_ivalue.type(); + // auto list = c10::impl::GenericList(elementType); + // list.push_back(input_shape_ivalue); + // list.push_back(input_shape_ivalue); + + std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); + + torch::jit::IValue complex_input_shape(input_shape_tuple); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + // torch::jit::IValue complex_input_shape(list); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 1; + + // compile_settings.torch_executed_ops.push_back("prim::TupleConstruct"); + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + // std::cout << out.toTensor() << std::endl; + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); +} + + +TEST(CppAPITests, TestCollectionListInputOutput) { std::string path = - "/root/Torch-TensorRT/list_input.ts"; + "/root/Torch-TensorRT/list_input_output.ts"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -187,7 +336,10 @@ TEST(CppAPITests, TestCollectionListInput) { auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); compile_settings.require_full_compilation = false; compile_settings.min_block_size = 1; + + // Need to skip the 
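// --- illustrative sketch (torch::allclose stands in for the repo's almostEqual) ---
// The assertions in these output tests unpack tuple results with
// toTuple()->elements() and list results with toList().vec(); the same
// element-by-element comparison as standalone helpers:
#include <torch/script.h>

static bool tuple_outputs_match(const c10::IValue& a, const c10::IValue& b) {
  const auto& ea = a.toTuple()->elements();
  const auto& eb = b.toTuple()->elements();
  if (ea.size() != eb.size()) return false;
  for (size_t i = 0; i < ea.size(); i++) {
    if (!torch::allclose(ea[i].toTensor(), eb[i].toTensor(), 1e-5, 1e-5)) return false;
  }
  return true;
}

static bool list_outputs_match(const c10::IValue& a, const c10::IValue& b) {
  auto va = a.toList().vec();
  auto vb = b.toList().vec();
  if (va.size() != vb.size()) return false;
  for (size_t i = 0; i < va.size(); i++) {
    if (!torch::allclose(va[i].toTensor(), vb[i].toTensor(), 1e-5, 1e-5)) return false;
  }
  return true;
}
// --- end sketch ---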
conversion of __getitem__ and ListConstruct compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + compile_settings.torch_executed_ops.push_back("prim::ListConstruct"); // // FP16 execution compile_settings.enabled_precisions = {torch::kHalf}; @@ -198,5 +350,7 @@ TEST(CppAPITests, TestCollectionListInput) { // auto trt_out = trt_mod.forward(complex_inputs_list); // std::cout << out.toTensor() << std::endl; - ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[0].toTensor(), trt_out.toList().vec()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[1].toTensor(), trt_out.toList().vec()[1].toTensor(), 1e-5)); } \ No newline at end of file diff --git a/tests/py/test_collection.py b/tests/py/test_collection.py index 91eca4c854..41c074e6f7 100644 --- a/tests/py/test_collection.py +++ b/tests/py/test_collection.py @@ -59,6 +59,25 @@ def forward(self, z: List[torch.Tensor]): r = z[0] + z[1] return r +class TupleInputOutput(nn.Module): + def __init__(self): + super(TupleInputOutput, self).__init__() + + def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r = (r1, r2) + return r + +class ListInputOutput(nn.Module): + def __init__(self): + super(ListInputOutput, self).__init__() + + def forward(self, z: List[torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r = [r1, r2] + return r input_data = torch.randn((16, 3, 32, 32)) input_data = input_data.float().to("cuda") @@ -82,4 +101,18 @@ def forward(self, z: List[torch.Tensor]): print(list_input_ts.graph) result = list_input_ts([input_data, input_data]) list_input_ts.to("cuda").eval() -torch.jit.save(list_input_ts, "./list_input.ts") \ No newline at end of file +torch.jit.save(list_input_ts, "./list_input.ts") + +tuple_input = TupleInputOutput() +tuple_input_ts = torch.jit.script(tuple_input) +print(tuple_input_ts.graph) +result = tuple_input_ts((input_data, input_data)) +tuple_input_ts.to("cuda").eval() +torch.jit.save(tuple_input_ts, "./tuple_input_output.ts") + +list_input = ListInputOutput() +list_input_ts = torch.jit.script(list_input) +print(list_input_ts.graph) +result = list_input_ts([input_data, input_data]) +list_input_ts.to("cuda").eval() +torch.jit.save(list_input_ts, "./list_input_output.ts") \ No newline at end of file From 6733cfb8d0c1302beb655aec9ce7cfcf7202f756 Mon Sep 17 00:00:00 2001 From: inocsin Date: Thu, 17 Mar 2022 17:33:11 +0800 Subject: [PATCH 08/22] feat: [collection] add unit test for complex collection model Signed-off-by: inocsin --- tests/cpp/test_collection.cpp | 78 +++++++++++++++++++++++++++++++++++ tests/py/test_collection.py | 25 ++++++++++- 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp index 0533b7ae77..d7948b1a6c 100644 --- a/tests/cpp/test_collection.cpp +++ b/tests/cpp/test_collection.cpp @@ -353,4 +353,82 @@ TEST(CppAPITests, TestCollectionListInputOutput) { ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[0].toTensor(), trt_out.toList().vec()[0].toTensor(), 1e-5)); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[1].toTensor(), trt_out.toList().vec()[1].toTensor(), 1e-5)); +} + + +TEST(CppAPITests, TestCollectionComplexModel) { + + std::string path = + "/root/Torch-TensorRT/complex_model.ts"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, 
torch::kCUDA).to(torch::kHalf); + std::vector inputs; + inputs.push_back(in0); + + torch::jit::Module mod; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + mod = torch::jit::load(path); + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + } + mod.eval(); + mod.to(torch::kCUDA); + + + std::vector inputs_; + + for (auto in : inputs) { + inputs_.push_back(torch::jit::IValue(in.clone())); + } + + std::vector complex_inputs; + auto input_list = c10::impl::GenericList(c10::TensorType::get()); + input_list.push_back(inputs_[0]); + input_list.push_back(inputs_[0]); + + torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); + + complex_inputs.push_back(input_list_ivalue); + + + auto out = mod.forward(complex_inputs); + LOG_DEBUG("Finish torchscirpt forward"); + + + // auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); + auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); + + auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); + + + c10::TypePtr elementType = input_shape_ivalue.type(); + auto list = c10::impl::GenericList(elementType); + list.push_back(input_shape_ivalue); + list.push_back(input_shape_ivalue); + + + torch::jit::IValue complex_input_shape(list); + std::tuple input_tuple2(complex_input_shape); + torch::jit::IValue complex_input_shape2(input_tuple2); + + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); + compile_settings.require_full_compilation = false; + compile_settings.min_block_size = 1; + + // Need to skip the conversion of __getitem__ and ListConstruct + compile_settings.torch_executed_ops.push_back("aten::__getitem__"); + compile_settings.torch_executed_ops.push_back("prim::ListConstruct"); + + // // FP16 execution + compile_settings.enabled_precisions = {torch::kHalf}; + // // Compile module + auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); + LOG_DEBUG("Finish compile"); + auto trt_out = trt_mod.forward(complex_inputs); + // auto trt_out = trt_mod.forward(complex_inputs_list); + + // std::cout << out.toTuple()->elements()[0].toTensor() << std::endl; + + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); + ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); } \ No newline at end of file diff --git a/tests/py/test_collection.py b/tests/py/test_collection.py index 41c074e6f7..d23a12b349 100644 --- a/tests/py/test_collection.py +++ b/tests/py/test_collection.py @@ -79,6 +79,22 @@ def forward(self, z: List[torch.Tensor]): r = [r1, r2] return r +class ComplexModel(nn.Module): + def __init__(self): + super(ComplexModel, self).__init__() + self.list_model = ListInputOutput() + self.tuple_model = TupleInputOutput() + + def forward(self, z: List[torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r3 = (r1, r2) + r4 = [r2, r1] + tuple_out = self.tuple_model(r3) + list_out = self.list_model(r4) + r = (tuple_out[1], list_out[0]) + return r + input_data = torch.randn((16, 3, 32, 32)) input_data = input_data.float().to("cuda") @@ -115,4 +131,11 @@ def forward(self, z: List[torch.Tensor]): print(list_input_ts.graph) result = list_input_ts([input_data, input_data]) list_input_ts.to("cuda").eval() -torch.jit.save(list_input_ts, "./list_input_output.ts") \ No 
newline at end of file +torch.jit.save(list_input_ts, "./list_input_output.ts") + +complex_model = ComplexModel() +complex_model_ts = torch.jit.script(complex_model) +print(complex_model_ts.graph) +result = complex_model_ts([input_data, input_data]) +complex_model_ts.to("cuda").eval() +torch.jit.save(complex_model_ts, "./complex_model.ts") \ No newline at end of file From d21b0ab143413b97af6a43233d2f156984bc2878 Mon Sep 17 00:00:00 2001 From: inocsin Date: Thu, 17 Mar 2022 18:44:36 +0800 Subject: [PATCH 09/22] chore: [collection] delete comments Signed-off-by: inocsin --- core/compiler.cpp | 16 ++-- core/ir/ir.cpp | 35 +-------- core/ir/ir.h | 11 --- core/partitioning/shape_analysis.cpp | 24 +----- cpp/include/torch_tensorrt/torch_tensorrt.h | 1 - tests/cpp/test_collection.cpp | 85 ++------------------- tests/py/test_collection.py | 31 -------- 7 files changed, 19 insertions(+), 184 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index 1c8ed34762..d16796bd8e 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -307,14 +307,14 @@ void MapInputsAndDetermineDTypes( ir::StaticParams& static_params, ir::CollectionTypeMap& first_use_type_map) { // ir::TypeMap& first_use_type_map) { - // Associate input specs with inputs - // cfg.convert_info.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params)); - cfg.convert_info.collection_inputs = std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params)); - - auto collection_inputs = ir::get_collection_inputs(g, static_params); - LOG_DEBUG("In MapInputsAndDetermineDTypes " << "g->inputs() size " << g->inputs().size() << ", collection_inputs size " << collection_inputs.size()); - // for (auto& in : g->inputs()) { - // if (static_params.find(in) == static_params.end()) { + // Associate input specs with inputs + // cfg.convert_info.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params)); + cfg.convert_info.collection_inputs = std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params)); + + auto collection_inputs = ir::get_collection_inputs(g, static_params); + LOG_DEBUG("In MapInputsAndDetermineDTypes " << "g->inputs() size " << g->inputs().size() << ", collection_inputs size " << collection_inputs.size()); + // for (auto& in : g->inputs()) { + // if (static_params.find(in) == static_params.end()) { for (auto in : collection_inputs) { std::vector& spec = cfg.convert_info.collection_inputs.find(in)->second; // ir::Input& spec = cfg.convert_info.inputs.find(in)->second; diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index 52bd92a17f..bbc8239097 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -67,28 +67,6 @@ std::vector get_tensor_inputs( if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); } - // else if (in->type()->cast() && static_params.find(in) == static_params.end()) { - // // } else if (in->type()->isSubtypeOf(c10::TupleType::create()) && static_params.find(in) == static_params.end()) { - // at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(in); - // LOG_DEBUG("Tuple size " << unpack_tuple.size()); - // for (auto item: unpack_tuple) { - // input_tensors.push_back(in); - // } - // } else if (in->type()->isSubtypeOf(c10::ListType::ofTensors()) && static_params.find(in) == static_params.end()) { - - // LOG_DEBUG("List use size " << in->uses().size()); - // // for (auto use : in->uses()) { - // // 
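// --- illustrative sketch (static-parameter filtering elided for brevity) ---
// Restating the filter get_collection_inputs applies after this cleanup: keep
// graph inputs that are tensors, tuples, or lists and that are not static
// parameters. The Graph/Value/type-kind calls are the stock TorchScript APIs.
#include <torch/script.h>
#include <vector>

static std::vector<torch::jit::Value*> collection_inputs(std::shared_ptr<torch::jit::Graph>& g) {
  std::vector<torch::jit::Value*> out;
  for (auto in : g->inputs()) {
    auto kind = in->type()->kind();
    if (in->type()->isSubtypeOf(c10::TensorType::get()) ||
        kind == c10::TypeKind::TupleType ||
        kind == c10::TypeKind::ListType) { // static params would be filtered out too
      out.push_back(in);
    }
  }
  return out;
}
// --- end sketch ---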
LOG_DEBUG(use.user->outputs()[0]->debugName()); - // // } - // // TODO: set the correct list number according to the Input IValue - // int n = 2; - // auto unpack_node = g->createListUnpack(in, n); - // g->block()->appendNode(unpack_node); - // for (auto item: unpack_node->outputs()) { - // input_tensors.push_back(item); - // } - // LOG_DEBUG("Unpack List of size " << n); - // } } return input_tensors; } @@ -101,11 +79,6 @@ std::vector get_collection_inputs( LOG_DEBUG("get_collection_inputs, inputs size " << inputs.size()); for (auto in : inputs) { LOG_DEBUG("input debug name: " << in->debugName()); - // Disregarding inputs that are not tensors or are static - // - // Ex. - // self.1:__torch__.alexnet -> ignored - // input.1:Tensor -> used if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); } else if (in->type()->kind() == torch::jit::TypeKind::TupleType && static_params.find(in) == static_params.end()) { @@ -246,6 +219,7 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* if (i->type() == c10::TensorType::get()) { torch::jit::Value* in = i; types.insert({in, {get_value_first_calc_dtype_opt(b, i)}}); + } else if(i->type()->kind() == torch::jit::TypeKind::TupleType) { LOG_DEBUG("get_block_first_calc_dtypes_opt_collection TupleType"); // TODO: to evaluate the data type of tuple element @@ -253,14 +227,10 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* c10::optional tp = get_value_first_calc_dtype_opt(b, i); at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(i); LOG_DEBUG("get_block_first_calc_dtypes_opt_collection: tuple size " << unpack_tuple.size()); - // Assume all tuple has the same datatype + // TODO: calculate the tuple element type // std::vector> dytpes(unpack_tuple.size(), tp); std::vector> dytpes(unpack_tuple.size()); types.insert({i, dytpes}); // insert an empty - // for (auto item: unpack_tuple) { - // torch::jit::Value* in = item; - // types.insert({in, get_value_first_calc_dtype_opt(b, i)}); - // } } else if(i->type()->kind() == torch::jit::TypeKind::ListType) { // TODO: to decide the size of list and type of list element @@ -269,7 +239,6 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* // std::vector> dytpes(i->uses().size()); std::vector> dytpes(i->uses().size(), tp); types.insert({i, dytpes}); // insert an empty - } } return types; diff --git a/core/ir/ir.h b/core/ir/ir.h index 69c70263ed..a66aaf7d33 100644 --- a/core/ir/ir.h +++ b/core/ir/ir.h @@ -12,8 +12,6 @@ namespace core { namespace ir { struct Input : torch::CustomClassHolder { - // Input(std::vector shape); - // Input(std::vector min_shape, std::vector opt_shape, std::vector max_shape); Input() {}; Input( std::vector shape, @@ -42,15 +40,6 @@ struct Input : torch::CustomClassHolder { // Add to spec struct GraphInputs { -// GraphInputs() {} -// GraphInputs(torch::jit::IValue inputs) { -// input_signature = inputs; -// // TODO flatten IValue -// } - // GraphInputs(std::vector inputs) { - // flattened_inputs = inputs; - // // TODO construct the IValue - // } torch::jit::IValue input_signature; // nested Input, full input spec std::vector flattened_inputs; // flattend Input std::vector> collection_inputs; // only support two layer nesting, e.g. 
((a, b), [c, d], e) diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index 6d69275e3a..1d330cc3d8 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -56,34 +56,18 @@ std::unordered_map generateRandomI if (input.first->type()->kind() == torch::jit::TypeKind::ListType) { // create list - // auto list = c10::impl::GenericList(c10::TensorType::get()); - // list.append(ivalues_maps[input]); LOG_DEBUG("generateRandomInputs, generate random input of list type"); - // jit_inputs_ivalues.push_back(ivalues_maps[input].toList()); std::vector list; c10::TypePtr elementType = c10::TensorType::get(); auto generic_list = c10::impl::GenericList(elementType); - LOG_DEBUG("generateRandomInputs, 0"); for (int i = 0; i < input.second.size(); i++) { - // types for list is {} - // auto in = generateSingleInput(input.second[i], types[input.first][i]); - // TODO: need to decide the input type of list elements in ir.cpp - // c10::optional type_opt = {}; - // auto in = generateSingleInput(input.second[i], type_opt); auto in = generateSingleInput(input.second[i], types[input.first][i]); - // list.push_back(in.clone()); generic_list.push_back(in.clone()); - LOG_DEBUG("generateRandomInputs, 1"); } - // c10::TypePtr elementType = list[0].type(); - LOG_DEBUG("generateRandomInputs, 2"); - // generic_list.append(list); ivalue_map[input.first] = c10::IValue(generic_list); - // jit_inputs_ivalues.push_back(list); LOG_DEBUG("generateRandomInputs, finish generate random input of list type"); } else if (input.first->type()->kind() == torch::jit::TypeKind::TupleType) { // create tuple - // auto tuple = torch::jit::Tuple::create(ivalues_maps[input]); LOG_DEBUG("generateRandomInputs, generate random input of tuple type"); std::vector list; for (int i = 0; i < input.second.size(); i++) { @@ -91,9 +75,7 @@ std::unordered_map generateRandomI list.push_back(in.clone()); } auto tuple = c10::ivalue::Tuple::create(list); // create tuple ptr - ivalue_map[input.first] = c10::IValue(tuple); - // jit_inputs_ivalues.push_back(tuple); } else { LOG_DEBUG("generateRandomInputs, generate random input of tensor type"); auto in = generateSingleInput(input.second[0], types[input.first][0]); @@ -152,14 +134,10 @@ void getSegmentsOutputByRunning( jit_inputs_ivalues.push_back(ivalues_maps[input].toBool()); } else if (input->type()->kind() == torch::jit::TypeKind::ListType) { // create list - // auto list = c10::impl::GenericList(c10::TensorType::get()); - // list.append(ivalues_maps[input]); LOG_DEBUG("getSegmentsOutputByRunning, handle list type"); - jit_inputs_ivalues.push_back(ivalues_maps[input].toList()); - // jit_inputs_ivalues.push_back(list); + jit_inputs_ivalues.push_back(ivalues_maps[input].toList());; } else if (input->type()->kind() == torch::jit::TypeKind::TupleType) { // create tuple - // auto tuple = torch::jit::Tuple::create(ivalues_maps[input]); LOG_DEBUG("getSegmentsOutputByRunning, handle tuple type"); jit_inputs_ivalues.push_back(ivalues_maps[input].toTuple()); } else if (input->type()->kind() == torch::jit::TypeKind::NumberType) { diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h index 1ee8dde3c9..6da7534987 100644 --- a/cpp/include/torch_tensorrt/torch_tensorrt.h +++ b/cpp/include/torch_tensorrt/torch_tensorrt.h @@ -521,7 +521,6 @@ struct TORCHTRT_API Input : torch::CustomClassHolder{ */ struct TORCHTRT_API GraphInputs { torch::jit::IValue input_signature; // nested Input, full input spec - // 
std::vector flattened_inputs; // flattend Input }; /** diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp index d7948b1a6c..f647af2c8c 100644 --- a/tests/cpp/test_collection.cpp +++ b/tests/cpp/test_collection.cpp @@ -8,8 +8,7 @@ TEST(CppAPITests, TestCollectionNormalInput) { - std::string path = - "/root/Torch-TensorRT/normal_model.ts"; + std::string path = "/root/Torch-TensorRT/normal_model.ts"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -54,12 +53,8 @@ TEST(CppAPITests, TestCollectionNormalInput) { TEST(CppAPITests, TestCollectionTupleInput) { - std::string path = - "/root/Torch-TensorRT/tuple_input.ts"; - // torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kFloat); + std::string path = "/root/Torch-TensorRT/tuple_input.ts"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); - // std::vector inputs; - // inputs.push_back(in0); torch::jit::Module mod; try { @@ -71,45 +66,25 @@ TEST(CppAPITests, TestCollectionTupleInput) { mod.eval(); mod.to(torch::kCUDA); - - // std::vector inputs_; - - // for (auto in : inputs) { - // inputs_.push_back(torch::jit::IValue(in.clone())); - // } - - std::vector complex_inputs, complex_inputs_list; - // std::vector tuple; std::tuple input_tuple(in0, in0); - // auto input_list = c10::impl::GenericList(c10::TensorType::get()); - // input_list.push_back(inputs_[0]); - // input_list.push_back(inputs_[0]); - - // torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); complex_inputs.push_back(input_tuple); auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); - // auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - // c10::TypePtr elementType = input_shape_ivalue.type(); - // auto list = c10::impl::GenericList(elementType); - // list.push_back(input_shape_ivalue); - // list.push_back(input_shape_ivalue); - std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); torch::jit::IValue complex_input_shape(input_shape_tuple); std::tuple input_tuple2(complex_input_shape); torch::jit::IValue complex_input_shape2(input_tuple2); - // torch::jit::IValue complex_input_shape(list); + auto compile_settings = torch_tensorrt::ts::CompileSpec(complex_input_shape2); compile_settings.require_full_compilation = false; @@ -121,7 +96,6 @@ TEST(CppAPITests, TestCollectionTupleInput) { auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - // std::cout << out.toTensor() << std::endl; ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); } @@ -129,8 +103,7 @@ TEST(CppAPITests, TestCollectionTupleInput) { TEST(CppAPITests, TestCollectionListInput) { - std::string path = - "/root/Torch-TensorRT/list_input.ts"; + std::string path = "/root/Torch-TensorRT/list_input.ts"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -165,13 +138,9 @@ TEST(CppAPITests, TestCollectionListInput) { auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); - - // auto input_shape = torch_tensorrt::Input(in0.sizes(), 
torch_tensorrt::DataType::kUnknown); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); - auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - c10::TypePtr elementType = input_shape_ivalue.type(); auto list = c10::impl::GenericList(elementType); list.push_back(input_shape_ivalue); @@ -193,21 +162,16 @@ TEST(CppAPITests, TestCollectionListInput) { auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - // auto trt_out = trt_mod.forward(complex_inputs_list); - // std::cout << out.toTensor() << std::endl; ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5)); } TEST(CppAPITests, TestCollectionTupleInputOutput) { - std::string path = - "/root/Torch-TensorRT/tuple_input_output.ts"; - // torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kFloat); + std::string path = "/root/Torch-TensorRT/tuple_input_output.ts"; + torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); - // std::vector inputs; - // inputs.push_back(in0); torch::jit::Module mod; try { @@ -220,38 +184,19 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { mod.to(torch::kCUDA); - // std::vector inputs_; - - // for (auto in : inputs) { - // inputs_.push_back(torch::jit::IValue(in.clone())); - // } - - std::vector complex_inputs, complex_inputs_list; - // std::vector tuple; std::tuple input_tuple(in0, in0); - // auto input_list = c10::impl::GenericList(c10::TensorType::get()); - // input_list.push_back(inputs_[0]); - // input_list.push_back(inputs_[0]); - - // torch::jit::IValue input_list_ivalue = torch::jit::IValue(input_list); complex_inputs.push_back(input_tuple); auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); - // auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - // c10::TypePtr elementType = input_shape_ivalue.type(); - // auto list = c10::impl::GenericList(elementType); - // list.push_back(input_shape_ivalue); - // list.push_back(input_shape_ivalue); - std::tuple input_shape_tuple(input_shape_ivalue, input_shape_ivalue); torch::jit::IValue complex_input_shape(input_shape_tuple); @@ -271,7 +216,6 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - // std::cout << out.toTensor() << std::endl; ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); @@ -280,8 +224,7 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { TEST(CppAPITests, TestCollectionListInputOutput) { - std::string path = - "/root/Torch-TensorRT/list_input_output.ts"; + std::string path = "/root/Torch-TensorRT/list_input_output.ts"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -316,8 +259,6 @@ TEST(CppAPITests, TestCollectionListInputOutput) { auto out = 
mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); - - // auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); @@ -347,9 +288,6 @@ TEST(CppAPITests, TestCollectionListInputOutput) { auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - // auto trt_out = trt_mod.forward(complex_inputs_list); - - // std::cout << out.toTensor() << std::endl; ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[0].toTensor(), trt_out.toList().vec()[0].toTensor(), 1e-5)); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toList().vec()[1].toTensor(), trt_out.toList().vec()[1].toTensor(), 1e-5)); @@ -358,8 +296,7 @@ TEST(CppAPITests, TestCollectionListInputOutput) { TEST(CppAPITests, TestCollectionComplexModel) { - std::string path = - "/root/Torch-TensorRT/complex_model.ts"; + std::string path = "/root/Torch-TensorRT/complex_model.ts"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -394,13 +331,10 @@ TEST(CppAPITests, TestCollectionComplexModel) { auto out = mod.forward(complex_inputs); LOG_DEBUG("Finish torchscirpt forward"); - - // auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kUnknown); auto input_shape = torch_tensorrt::Input(in0.sizes(), torch_tensorrt::DataType::kHalf); auto input_shape_ivalue = torch::jit::IValue(std::move(c10::make_intrusive(input_shape))); - c10::TypePtr elementType = input_shape_ivalue.type(); auto list = c10::impl::GenericList(elementType); list.push_back(input_shape_ivalue); @@ -425,9 +359,6 @@ TEST(CppAPITests, TestCollectionComplexModel) { auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings); LOG_DEBUG("Finish compile"); auto trt_out = trt_mod.forward(complex_inputs); - // auto trt_out = trt_mod.forward(complex_inputs_list); - - // std::cout << out.toTuple()->elements()[0].toTensor() << std::endl; ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5)); ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5)); diff --git a/tests/py/test_collection.py b/tests/py/test_collection.py index d23a12b349..e71d754505 100644 --- a/tests/py/test_collection.py +++ b/tests/py/test_collection.py @@ -4,37 +4,6 @@ import torch.nn.functional as F from typing import Tuple, List, Dict -# class Model1(nn.Module): -# def __init__(self): -# super(Model1, self).__init__() - -# def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): -# r = z[1] + z[0] -# return r, z[1] - - -# class TestModel1(nn.Module): -# def __init__(self): -# super(TestModel, self).__init__() -# self.model1 = Model1() - -# def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): -# r2, r1 = self.model1((z[0], z[1])) -# # unsupport ops -# i = r2.size(1) -# j = r2.size(2) -# # r3 = torch.tensor(i) * torch.tensor(j) -# r3 = r2[0,0,0,0] -# k = int(r3) - 5 - -# # if k > 0: -# r = r1 - k -# result = (r, r1) -# # else: -# # r = r1 - k -# # result = (r1, r) -# return result - class Normal(nn.Module): def __init__(self): super(Normal, self).__init__() From 
eada66db06727bef0f7fce705a23fb96729defe5 Mon Sep 17 00:00:00 2001 From: inocsin Date: Thu, 31 Mar 2022 19:03:27 +0800 Subject: [PATCH 10/22] chore: [collection] update code and comments Signed-off-by: inocsin --- core/compiler.cpp | 23 ++++--------------- core/ir/ir.cpp | 21 ++++++------------ core/lowering/lowering.cpp | 1 - core/partitioning/shape_analysis.cpp | 33 +++------------------------- cpp/src/compile_spec.cpp | 8 +++---- 5 files changed, 18 insertions(+), 68 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index d16796bd8e..1d97139041 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -306,19 +306,13 @@ void MapInputsAndDetermineDTypes( std::shared_ptr& g, ir::StaticParams& static_params, ir::CollectionTypeMap& first_use_type_map) { - // ir::TypeMap& first_use_type_map) { - // Associate input specs with inputs - // cfg.convert_info.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params)); cfg.convert_info.collection_inputs = std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params)); auto collection_inputs = ir::get_collection_inputs(g, static_params); - LOG_DEBUG("In MapInputsAndDetermineDTypes " << "g->inputs() size " << g->inputs().size() << ", collection_inputs size " << collection_inputs.size()); - // for (auto& in : g->inputs()) { - // if (static_params.find(in) == static_params.end()) { + LOG_DEBUG("In MapInputsAndDetermineDTypes, the g->inputs() size is " << g->inputs().size() << ", CollectionInputSpecMap size is " << collection_inputs.size()); + for (auto in : collection_inputs) { std::vector& spec = cfg.convert_info.collection_inputs.find(in)->second; - // ir::Input& spec = cfg.convert_info.inputs.find(in)->second; - // c10::optional est_type_opt = {}; std::vector> est_type_opt; auto est_it = first_use_type_map.find(in); @@ -331,9 +325,8 @@ void MapInputsAndDetermineDTypes( // If we can calculate the type from the graph and the type was not defined by the user then use the calculated // type LOG_INFO( - "Since input type is not explicitly defined, infering using first tensor calculation\n Found input " - << in->debugName() << " has type " << est_type_opt[i].value() - << ". If this is incorrect explicitly set dtype for input and file a bug"); + "Since input type is not explicitly defined, inferring using first tensor calculation\n Inferred input " + << in->debugName() << " has type " << est_type_opt[i].value()); spec[i].dtype = util::ScalarTypeToTRTDataType(est_type_opt[i].value()); } else if (!est_type_opt[i] && !spec[i].dtype_is_user_defined) { // If we cannot calculate the type and the user did not define the type, then default to FP32 @@ -344,12 +337,9 @@ void MapInputsAndDetermineDTypes( } else if (spec[i].dtype_is_user_defined && cfg.partition_info.enabled) { if (!est_type_opt[i]) { LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting"); - // TODO set input data type - std::stringstream ss; ss << "For input " << in->debugName() << ", found user specified input dtype as "; ss << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; - // ss << cfg.convert_info.inputs.find(in)->second.dtype; ss << ". 
The compiler is going to use the user setting " << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; auto warn_str = ss.str(); LOG_WARNING(warn_str); @@ -357,15 +347,12 @@ void MapInputsAndDetermineDTypes( first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_inputs.find(in)->second[i].dtype)}; } else { - // if (util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype) != est_type_opt.value()) { if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_inputs.find(in)->second[i].dtype) != est_type_opt[i].value()) { std::stringstream ss; ss << "For input " << in->debugName() << ", found user specified input dtype as "; ss << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; - // ss << cfg.convert_info.inputs.find(in)->second.dtype; ss << ", however when inspecting the graph, the input type expected was inferred to be "; ss << est_type_opt[i].value() << std::endl; - // ss << "The compiler is going to use the user setting " << cfg.convert_info.inputs.find(in)->second.dtype; ss << "The compiler is going to use the user setting " << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n"; ss << "compatibility with PyTorch's data type convention is required.\n"; @@ -375,7 +362,6 @@ void MapInputsAndDetermineDTypes( auto warn_str = ss.str(); LOG_WARNING(warn_str); // Overwrite type map with user settings - // first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)}; first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_inputs.find(in)->second[i].dtype)}; } } @@ -447,7 +433,6 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) auto params = graph_and_parameters.second; auto static_params = ir::get_static_params(g->inputs(), params); // Infer the type of an input from the weights of the calculation - // auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block()); auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block()); MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types); diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index bbc8239097..5da2c121f6 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -22,7 +22,6 @@ CollectionInputSpecMap associate_specs_with_collection_inputs( } InputSpecMap pair_input_vals_with_specs(std::vector vals, std::vector specs) { - LOG_DEBUG("pair_input_vals_with_specs"); TORCHTRT_CHECK( vals.size() == specs.size(), "Expected dimension specifications for all input tensors" @@ -37,7 +36,6 @@ InputSpecMap pair_input_vals_with_specs(std::vector va } CollectionInputSpecMap pair_input_vals_with_specs_collection(std::vector vals, std::vector>& specs) { - LOG_DEBUG("pair_input_vals_with_specs collection"); TORCHTRT_CHECK( vals.size() == specs.size(), "Expected dimension specifications for all input tensors" @@ -56,9 +54,9 @@ std::vector get_tensor_inputs( StaticParams& static_params) { std::vector input_tensors; auto inputs = g->inputs(); - LOG_DEBUG("Inputs size " << inputs.size()); + LOG_DEBUG("Raw inputs size of get_tensor_inputs: " << inputs.size()); for (auto in : inputs) { - LOG_DEBUG("input debug name: " << in->debugName()); + LOG_DEBUG("Handle input of debug name: " << in->debugName()); // Disregarding inputs that are not tensors or are static // // Ex. 
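To make the collection-input plumbing above concrete, here is a minimal sketch of driving it from the public C++ API: the Input specs are nested the same way the model nests its arguments, and each spec pins its dtype explicitly so the first-use inference in MapInputsAndDetermineDTypes is never consulted. Construction mirrors tests/cpp/test_collection.cpp; the module path, shape, and dtype are illustrative and not part of the patch.

#include "torch/script.h"
#include "torch_tensorrt/torch_tensorrt.h"

int main() {
  // Illustrative module whose signature is forward(z: Tuple[Tensor, Tensor])
  auto mod = torch::jit::load("tuple_input.ts");
  mod.to(torch::kCUDA);
  mod.eval();

  // One spec per tensor; an explicit dtype sets dtype_is_user_defined on the spec.
  auto spec = torch_tensorrt::Input(std::vector<int64_t>{1, 3, 512, 512}, torch_tensorrt::DataType::kHalf);
  auto spec_ivalue = torch::jit::IValue(c10::make_intrusive<torch_tensorrt::Input>(spec));

  // Nest the specs like the call signature: one tuple argument holding two tensors.
  std::tuple<torch::jit::IValue, torch::jit::IValue> inner(spec_ivalue, spec_ivalue);
  torch::jit::IValue inner_ivalue(inner);
  std::tuple<torch::jit::IValue> top(inner_ivalue);
  torch::jit::IValue signature(top);

  auto settings = torch_tensorrt::ts::CompileSpec(signature);
  settings.enabled_precisions.insert(torch_tensorrt::DataType::kHalf);
  settings.require_full_compilation = false;

  auto trt_mod = torch_tensorrt::torchscript::compile(mod, settings);
  return 0;
}

Under the two-level limit, this signature flattens to two Input specs in a single top-level group; a third level of nesting would hit the nesting-depth LOG_ERROR in flatten_dfs.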
@@ -76,9 +74,9 @@ std::vector get_collection_inputs( StaticParams& static_params) { std::vector input_tensors; auto inputs = g->inputs(); - LOG_DEBUG("get_collection_inputs, inputs size " << inputs.size()); + LOG_DEBUG("Raw inputs size of get_collection_inputs: " << inputs.size()); for (auto in : inputs) { - LOG_DEBUG("input debug name: " << in->debugName()); + LOG_DEBUG("Handle input of debug name: " << in->debugName()); if (in->type()->isSubtypeOf(c10::TensorType::get()) && static_params.find(in) == static_params.end()) { input_tensors.push_back(in); } else if (in->type()->kind() == torch::jit::TypeKind::TupleType && static_params.find(in) == static_params.end()) { @@ -101,12 +99,9 @@ c10::optional get_value_first_calc_dtype_opt(torch::jit::Block* auto b_ins = b->inputs(); std::unordered_set b_in_set(b_ins.begin(), b_ins.end()); - // TORCHTRT_ASSERT( - // in->type() == c10::TensorType::get(), "Input is not a tensor, cannot check for dtype based on calculation"); - auto consumers = in->uses(); auto search_list = std::vector(consumers.begin(), consumers.end()); - LOG_DEBUG("Users number for " << in->debugName() << ": " << consumers.size()); + while(search_list.size() > 0) { // after insertion, original iterator will be invalid auto& u = search_list.front(); @@ -221,13 +216,11 @@ CollectionTypeMap get_block_first_calc_dtypes_opt_collection(torch::jit::Block* types.insert({in, {get_value_first_calc_dtype_opt(b, i)}}); } else if(i->type()->kind() == torch::jit::TypeKind::TupleType) { - LOG_DEBUG("get_block_first_calc_dtypes_opt_collection TupleType"); // TODO: to evaluate the data type of tuple element // make sure very time get the same ptr - c10::optional tp = get_value_first_calc_dtype_opt(b, i); + // c10::optional tp = get_value_first_calc_dtype_opt(b, i); at::ArrayRef unpack_tuple = torch::jit::createTupleUnpack(i); - LOG_DEBUG("get_block_first_calc_dtypes_opt_collection: tuple size " << unpack_tuple.size()); - // TODO: calculate the tuple element type + // TODO: calculate the tuple element type, currently we use {} as default datatype // std::vector> dytpes(unpack_tuple.size(), tp); std::vector> dytpes(unpack_tuple.size()); types.insert({i, dytpes}); // insert an empty diff --git a/core/lowering/lowering.cpp b/core/lowering/lowering.cpp index 0051ad451c..8bbae296c3 100644 --- a/core/lowering/lowering.cpp +++ b/core/lowering/lowering.cpp @@ -33,7 +33,6 @@ void LowerGraph(std::shared_ptr& g, LowerInfo lower_info) { torch::jit::InlineFunctionalGraphs(g); torch::jit::PeepholeOptimize(g, false); torch::jit::FuseLinear(g); - // torch::jit::LowerAllTuples(g); if (!lower_info.disable_cse) { torch::jit::EliminateCommonSubexpression(g); } diff --git a/core/partitioning/shape_analysis.cpp b/core/partitioning/shape_analysis.cpp index 1d330cc3d8..961831cb47 100644 --- a/core/partitioning/shape_analysis.cpp +++ b/core/partitioning/shape_analysis.cpp @@ -25,38 +25,17 @@ at::Tensor generateSingleInput(ir::Input& input, c10::optional& } std::unordered_map generateRandomInputs( -// std::unordered_map> generateRandomInputs( - // std::unordered_map& inputs, std::unordered_map>& inputs, - // std::unordered_map>& types) { std::unordered_map>>& types) { + // generate random inputs for running pytorch segments std::unordered_map ivalue_map; - // std::unordered_map> ivalue_map; - // TODO - // uint64_t in_i = 0; - for (auto& input : inputs) { - // for (int i = 0; i < input.second.size(); i++) { - // auto cur_shape = input.second[i].input_shape; - // std::vector shape; - // shape.insert(shape.begin(), 
std::begin(cur_shape.d), std::begin(cur_shape.d) + cur_shape.nbDims); - // auto type_opt = types[input.first][i]; - // auto type = at::kFloat; - // if (type_opt) { - // type = type_opt.value(); - // } else { - // LOG_WARNING("Input type for doing shape analysis could not be determined, defaulting to F32"); - // } - // auto in = at::randint(5, shape, {at::kCUDA}).to(type); - // // ivalue_map[input.first] = in.clone(); - // ivalue_map[input.first].push_back(in.clone()); - // // in_i++; - // } + + for (auto& input : inputs) { if (input.first->type()->kind() == torch::jit::TypeKind::ListType) { // create list - LOG_DEBUG("generateRandomInputs, generate random input of list type"); std::vector list; c10::TypePtr elementType = c10::TensorType::get(); auto generic_list = c10::impl::GenericList(elementType); @@ -65,10 +44,8 @@ std::unordered_map generateRandomI generic_list.push_back(in.clone()); } ivalue_map[input.first] = c10::IValue(generic_list); - LOG_DEBUG("generateRandomInputs, finish generate random input of list type"); } else if (input.first->type()->kind() == torch::jit::TypeKind::TupleType) { // create tuple - LOG_DEBUG("generateRandomInputs, generate random input of tuple type"); std::vector list; for (int i = 0; i < input.second.size(); i++) { auto in = generateSingleInput(input.second[i], types[input.first][i]); @@ -77,7 +54,6 @@ std::unordered_map generateRandomI auto tuple = c10::ivalue::Tuple::create(list); // create tuple ptr ivalue_map[input.first] = c10::IValue(tuple); } else { - LOG_DEBUG("generateRandomInputs, generate random input of tensor type"); auto in = generateSingleInput(input.second[0], types[input.first][0]); ivalue_map[input.first] = in.clone(); @@ -89,7 +65,6 @@ std::unordered_map generateRandomI void getSegmentsOutputByRunning( SegmentedBlock& seg_block, std::unordered_map& ivalues_maps, - // std::unordered_map>& ivalues_maps, const PartitionInfo& partition_info) { // create a module to run the graph auto g = seg_block.g(); @@ -134,11 +109,9 @@ void getSegmentsOutputByRunning( jit_inputs_ivalues.push_back(ivalues_maps[input].toBool()); } else if (input->type()->kind() == torch::jit::TypeKind::ListType) { // create list - LOG_DEBUG("getSegmentsOutputByRunning, handle list type"); jit_inputs_ivalues.push_back(ivalues_maps[input].toList());; } else if (input->type()->kind() == torch::jit::TypeKind::TupleType) { // create tuple - LOG_DEBUG("getSegmentsOutputByRunning, handle tuple type"); jit_inputs_ivalues.push_back(ivalues_maps[input].toTuple()); } else if (input->type()->kind() == torch::jit::TypeKind::NumberType) { jit_inputs_ivalues.push_back(ivalues_maps[input].toScalar()); diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 5e015e3a6e..ef000506c7 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -81,7 +81,7 @@ void flatten_dfs(std::vector& flattened_inputs, std:: } else if (level == 2) { // like A in [(A, A), C] collection_inputs[index].push_back(cur_input); } else {// only support 2 level - LOG_ERROR("3 level of input specs is not supported"); + LOG_ERROR("Input nesting depth exceeds currently supported depth (3), use 1 level: [A, B], or 2 level: [A, (B, C)]"); } } } @@ -99,7 +99,7 @@ torch_tensorrt::core::ir::GraphInputs to_internal_graph_inputs(GraphInputs exter internal_graph_input.input_signature = converted_input_signature; internal_graph_input.collection_inputs = collection_inputs; - LOG_DEBUG("compile_spec.cpp, to_internal_graph_inputs, flattened_inputs size " << flattened_inputs.size() << ", collection_inputs 
size "<< collection_inputs.size()); + LOG_DEBUG("Convert external_graph_input to internal_graph_inputs, total input input spec number: " << flattened_inputs.size() << ", top level input spec number "<< collection_inputs.size()); return internal_graph_input; } @@ -107,11 +107,11 @@ torch_tensorrt::core::ir::GraphInputs to_internal_graph_inputs(GraphInputs exter torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) { torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.inputs)); if (internal.inputs.size() == 0) { - LOG_DEBUG("to_internal_compile_spec, Input size == 0, using graph_input"); + LOG_DEBUG("GraphInput.inputs size == 0, using GraphInput.input_signature to get Input spec"); internal.graph_inputs = to_internal_graph_inputs(external.graph_inputs); internal.inputs = internal.graph_inputs.flattened_inputs; } else { - LOG_DEBUG("to_internal_compile_spec, Input size != 0, using original Input to construct collection_input"); + LOG_DEBUG("GraphInput.inputs size != 0, using GraphInput.inputs to get Input spec"); internal.graph_inputs.collection_inputs.resize(internal.inputs.size()); for (int i = 0; i < internal.inputs.size(); i++) { internal.graph_inputs.collection_inputs[i].push_back(internal.inputs[i]); From 633c00f9c122196cd2d4e567e3d4d6fbd64cd7c6 Mon Sep 17 00:00:00 2001 From: inocsin Date: Thu, 31 Mar 2022 19:13:20 +0800 Subject: [PATCH 11/22] chore: [collection] rename ConversionInfo.collection_inputs to ConversionInfo.collection_input_spec_map Signed-off-by: inocsin --- core/compiler.cpp | 20 ++++++++++---------- core/compiler.h | 6 +----- core/conversion/conversion.cpp | 4 +--- core/conversion/conversion.h | 2 +- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index 1d97139041..57b4667bce 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -306,13 +306,13 @@ void MapInputsAndDetermineDTypes( std::shared_ptr& g, ir::StaticParams& static_params, ir::CollectionTypeMap& first_use_type_map) { - cfg.convert_info.collection_inputs = std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params)); + cfg.convert_info.collection_input_spec_map = std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params)); auto collection_inputs = ir::get_collection_inputs(g, static_params); LOG_DEBUG("In MapInputsAndDetermineDTypes, the g->inputs() size is " << g->inputs().size() << ", CollectionInputSpecMap size is" << collection_inputs.size()); for (auto in : collection_inputs) { - std::vector& spec = cfg.convert_info.collection_inputs.find(in)->second; + std::vector& spec = cfg.convert_info.collection_input_spec_map.find(in)->second; std::vector> est_type_opt; auto est_it = first_use_type_map.find(in); @@ -339,21 +339,21 @@ void MapInputsAndDetermineDTypes( LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting"); std::stringstream ss; ss << "For input " << in->debugName() << ", found user specified input dtype as "; - ss << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; - ss << ". The compiler is going to use the user setting " << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; + ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; + ss << ". 
The compiler is going to use the user setting " << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; auto warn_str = ss.str(); LOG_WARNING(warn_str); // Overwrite type map with user settings - first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_inputs.find(in)->second[i].dtype)}; + first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; } else { - if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_inputs.find(in)->second[i].dtype) != est_type_opt[i].value()) { + if (util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype) != est_type_opt[i].value()) { std::stringstream ss; ss << "For input " << in->debugName() << ", found user specified input dtype as "; - ss << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; + ss << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; ss << ", however when inspecting the graph, the input type expected was inferred to be "; ss << est_type_opt[i].value() << std::endl; - ss << "The compiler is going to use the user setting " << cfg.convert_info.collection_inputs.find(in)->second[i].dtype; + ss << "The compiler is going to use the user setting " << cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype; ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n"; ss << "compatibility with PyTorch's data type convention is required.\n"; ss << "If you do indeed see errors at runtime either:\n"; @@ -362,7 +362,7 @@ void MapInputsAndDetermineDTypes( auto warn_str = ss.str(); LOG_WARNING(warn_str); // Overwrite type map with user settings - first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_inputs.find(in)->second[i].dtype)}; + first_use_type_map[in][i] = {util::TRTDataTypeToScalarType(cfg.convert_info.collection_input_spec_map.find(in)->second[i].dtype)}; } } } else { @@ -447,7 +447,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) !(cfg.lower_info.forced_fallback_modules.size() == 0 && cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) { - auto collection_input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.collection_inputs, first_use_types); + auto collection_input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.collection_input_spec_map, first_use_types); auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), collection_input_ivalues_map, cfg, static_params); new_g = graph_and_mapping.first; LOG_INFO("Segmented Graph: " << *new_g); diff --git a/core/compiler.h b/core/compiler.h index 71aa8899b2..750cd59c8e 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -14,12 +14,8 @@ namespace torch_tensorrt { namespace core { struct CompileSpec { - CompileSpec(std::vector inputs) : inputs(inputs) { - // graph_inputs = ir::GraphInputs(inputs); - } + CompileSpec(std::vector inputs) : inputs(inputs) {} CompileSpec(torch::jit::IValue& input_signature) { - // graph_inputs = ir::GraphInputs(input_signature); - // inputs = graph_inputs.flattened_inputs; graph_inputs.input_signature = input_signature; } ir::GraphInputs graph_inputs; diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index bafde231a1..3c9eb0dea4 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -135,9 +135,8 @@ void AddInputs( 
ConversionCtx* ctx, c10::ArrayRef inputs, ConversionInfo& conversion_info) { - // std::unordered_map& input_specs) { std::unordered_map& input_specs = conversion_info.inputs; - std::unordered_map> collection_input_spec = conversion_info.collection_inputs; + std::unordered_map> collection_input_spec = conversion_info.collection_input_spec_map; std::vector input_tensors; for (auto in : inputs) { @@ -415,7 +414,6 @@ void ConvertBlockToNetDef( auto inputs = b->inputs(); AddParamsToCtxValueMap(ctx, static_params); - // AddInputs(ctx, inputs, build_info.inputs); AddInputs(ctx, inputs, build_info); auto nodes = b->nodes(); diff --git a/core/conversion/conversion.h b/core/conversion/conversion.h index ba194716e8..148d99ac13 100644 --- a/core/conversion/conversion.h +++ b/core/conversion/conversion.h @@ -13,7 +13,7 @@ namespace conversion { struct ConversionInfo { ir::InputSpecMap inputs; - ir::CollectionInputSpecMap collection_inputs; + ir::CollectionInputSpecMap collection_input_spec_map; BuilderSettings engine_settings; }; From 89665c8d67a99bbae1c559af83ae9a60eb0517a9 Mon Sep 17 00:00:00 2001 From: inocsin Date: Thu, 31 Mar 2022 19:40:11 +0800 Subject: [PATCH 12/22] refactor: [collection] fuse Input with GraphInputs Signed-off-by: inocsin --- core/compiler.h | 5 ++-- core/ir/ir.h | 2 +- cpp/include/torch_tensorrt/torch_tensorrt.h | 17 ++++--------- cpp/src/compile_spec.cpp | 27 +++++++++++---------- 4 files changed, 23 insertions(+), 28 deletions(-) diff --git a/core/compiler.h b/core/compiler.h index 750cd59c8e..85bc1d2c08 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -14,12 +14,13 @@ namespace torch_tensorrt { namespace core { struct CompileSpec { - CompileSpec(std::vector inputs) : inputs(inputs) {} + CompileSpec(std::vector inputs) { + graph_inputs.inputs = inputs; + } CompileSpec(torch::jit::IValue& input_signature) { graph_inputs.input_signature = input_signature; } ir::GraphInputs graph_inputs; - std::vector inputs; // can be replaced by graph_inputs conversion::ConversionInfo convert_info; lowering::LowerInfo lower_info; partitioning::PartitionInfo partition_info; diff --git a/core/ir/ir.h b/core/ir/ir.h index a66aaf7d33..c138ad693b 100644 --- a/core/ir/ir.h +++ b/core/ir/ir.h @@ -41,7 +41,7 @@ struct Input : torch::CustomClassHolder { // Add to spec struct GraphInputs { torch::jit::IValue input_signature; // nested Input, full input spec - std::vector flattened_inputs; // flattend Input + std::vector inputs; // flattened Input std::vector> collection_inputs; // only support two layer nesting, e.g.
((a, b), [c, d], e) }; diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h index 6da7534987..e19b9f1408 100644 --- a/cpp/include/torch_tensorrt/torch_tensorrt.h +++ b/cpp/include/torch_tensorrt/torch_tensorrt.h @@ -517,10 +517,11 @@ struct TORCHTRT_API Input : torch::CustomClassHolder{ /** * @brief A struct to hold complex inputs * - * This struct can either hold a conplex inputs of shape or a flattened one, + * This struct can either hold a complex inputs of shape or a flattened one, */ struct TORCHTRT_API GraphInputs { - torch::jit::IValue input_signature; // nested Input, full input spec + torch::jit::IValue input_signature; // nested Input, full input spec + std::vector inputs; // flatten input spec }; /** @@ -590,25 +591,17 @@ struct TORCHTRT_API CompileSpec { * * @param inputs */ - CompileSpec(std::vector inputs) : inputs(std::move(inputs)) {} + CompileSpec(std::vector inputs); /** * @brief Construct a new Extra Info object from IValue. * The IValue store a complex Input * - * @param inputs + * @param input_signature */ CompileSpec(torch::jit::IValue input_signature); // Defaults should reflect TensorRT defaults for BuilderConfig - /** - * @brief Specifications for inputs to the engine, can either be a single size or a range defined by min, opt and max - * sizes Users can also specify expected input type as well as tensor memory format - * - * Order in vector should match call order for the function - */ - std::vector inputs; - /** * @brief Specifications for inputs to the engine, can store a IValue which has stored complex Input * or a flatened Input diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index ef000506c7..366476b227 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -18,22 +18,26 @@ torchtrt::core::runtime::CudaDevice to_internal_cuda_device(Device device); namespace torchscript { CompileSpec::CompileSpec(std::vector> fixed_sizes) { for (auto in : fixed_sizes) { - inputs.push_back(Input(in)); + graph_inputs.inputs.push_back(Input(in)); } - // graph_inputs.flattened_inputs = inputs; } CompileSpec::CompileSpec(std::vector> fixed_sizes) { for (auto in : fixed_sizes) { - inputs.push_back(Input(in)); + graph_inputs.inputs.push_back(Input(in)); } - // graph_inputs.flattened_inputs = inputs; +} + +CompileSpec::CompileSpec(std::vector inputs) { + graph_inputs.inputs = std::move(inputs); } CompileSpec::CompileSpec(torch::jit::IValue input_signature) { graph_inputs.input_signature = input_signature; } + + void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue, int level, int index) { if (input_ivalue.isTuple()) { @@ -59,7 +63,6 @@ void flatten_dfs(std::vector& flattened_inputs, std:: } c10::TypePtr type = input_list[0].type(); auto converted_elements = c10::impl::GenericList(type); - // std::vector converted_elements; int idx = 0; for (auto item: input_list) { int cur_idx = level < 1 ? 
idx: index; @@ -95,7 +98,7 @@ torch_tensorrt::core::ir::GraphInputs to_internal_graph_inputs(GraphInputs exter torch::jit::IValue converted_input_signature; flatten_dfs(flattened_inputs, collection_inputs, external_graph_input.input_signature, converted_input_signature, 0, 0); - internal_graph_input.flattened_inputs = flattened_inputs; + internal_graph_input.inputs = flattened_inputs; internal_graph_input.input_signature = converted_input_signature; internal_graph_input.collection_inputs = collection_inputs; @@ -105,17 +108,15 @@ torch_tensorrt::core::ir::GraphInputs to_internal_graph_inputs(GraphInputs exter } torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) { - torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.inputs)); - if (internal.inputs.size() == 0) { + torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.graph_inputs.inputs)); + if (internal.graph_inputs.inputs.size() == 0) { LOG_DEBUG("GraphInput.inputs size == 0, using GraphInput.input_signature to get Input spec"); internal.graph_inputs = to_internal_graph_inputs(external.graph_inputs); - internal.inputs = internal.graph_inputs.flattened_inputs; } else { LOG_DEBUG("GraphInput.inputs size != 0, using GraphInput.inputs to get Input spec"); - internal.graph_inputs.collection_inputs.resize(internal.inputs.size()); - for (int i = 0; i < internal.inputs.size(); i++) { - internal.graph_inputs.collection_inputs[i].push_back(internal.inputs[i]); - internal.graph_inputs.flattened_inputs = internal.inputs; + internal.graph_inputs.collection_inputs.resize(internal.graph_inputs.inputs.size()); + for (int i = 0; i < internal.graph_inputs.inputs.size(); i++) { + internal.graph_inputs.collection_inputs[i].push_back(internal.graph_inputs.inputs[i]); } } From 205452e95d97f35141670cb982888ebfa2273d63 Mon Sep 17 00:00:00 2001 From: inocsin Date: Thu, 31 Mar 2022 19:59:22 +0800 Subject: [PATCH 13/22] feat: [collection] move collection test model to hub.py Signed-off-by: inocsin --- tests/modules/hub.py | 95 +++++++++++++++++++++++++++++++ tests/py/test_collection.py | 110 ------------------------------------ 2 files changed, 95 insertions(+), 110 deletions(-) delete mode 100644 tests/py/test_collection.py diff --git a/tests/modules/hub.py b/tests/modules/hub.py index 7b707f5785..f03658321c 100644 --- a/tests/modules/hub.py +++ b/tests/modules/hub.py @@ -3,7 +3,11 @@ import torch.nn.functional as F import torchvision.models as models import timm +<<<<<<< HEAD from transformers import BertModel, BertTokenizer, BertConfig +======= +from typing import Tuple, List, Dict +>>>>>>> feat: [collection] move collection test model to hub.py torch.hub._validate_not_a_forked_repo = lambda a, b, c: True @@ -217,3 +221,94 @@ def forward(self, x): traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) torch.jit.save(traced_model, "bert_base_uncased_traced.jit.pt") + +# Collection input/output models +class Normal(nn.Module): + def __init__(self): + super(Normal, self).__init__() + + def forward(self, x, y): + r = x + y + return r + +class TupleInput(nn.Module): + def __init__(self): + super(TupleInput, self).__init__() + + def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): + r = z[0] + z[1] + return r + +class ListInput(nn.Module): + def __init__(self): + super(ListInput, self).__init__() + + def forward(self, z: List[torch.Tensor]): + r = z[0] + z[1] + return r + +class TupleInputOutput(nn.Module): + def __init__(self): + super(TupleInputOutput, self).__init__() + + def 
forward(self, z: Tuple[torch.Tensor, torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r = (r1, r2) + return r + +class ListInputOutput(nn.Module): + def __init__(self): + super(ListInputOutput, self).__init__() + + def forward(self, z: List[torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r = [r1, r2] + return r + +class ComplexModel(nn.Module): + def __init__(self): + super(ComplexModel, self).__init__() + self.list_model = ListInputOutput() + self.tuple_model = TupleInputOutput() + + def forward(self, z: List[torch.Tensor]): + r1 = z[0] + z[1] + r2 = z[0] - z[1] + r3 = (r1, r2) + r4 = [r2, r1] + tuple_out = self.tuple_model(r3) + list_out = self.list_model(r4) + r = (tuple_out[1], list_out[0]) + return r + +normal_model = Normal() +normal_model_ts = torch.jit.script(normal_model) +normal_model_ts.to("cuda").eval() +torch.jit.save(normal_model_ts, "normal_model.ts") + +tuple_input = TupleInput() +tuple_input_ts = torch.jit.script(tuple_input) +tuple_input_ts.to("cuda").eval() +torch.jit.save(tuple_input_ts, "tuple_input.ts") + +list_input = ListInput() +list_input_ts = torch.jit.script(list_input) +list_input_ts.to("cuda").eval() +torch.jit.save(list_input_ts, "list_input.ts") + +tuple_input = TupleInputOutput() +tuple_input_ts = torch.jit.script(tuple_input) +tuple_input_ts.to("cuda").eval() +torch.jit.save(tuple_input_ts, "tuple_input_output.ts") + +list_input = ListInputOutput() +list_input_ts = torch.jit.script(list_input) +list_input_ts.to("cuda").eval() +torch.jit.save(list_input_ts, "list_input_output.ts") + +complex_model = ComplexModel() +complex_model_ts = torch.jit.script(complex_model) +complex_model_ts.to("cuda").eval() +torch.jit.save(complex_model_ts, "complex_model.ts") diff --git a/tests/py/test_collection.py b/tests/py/test_collection.py deleted file mode 100644 index e71d754505..0000000000 --- a/tests/py/test_collection.py +++ /dev/null @@ -1,110 +0,0 @@ -import torch -import copy -import torch.nn as nn -import torch.nn.functional as F -from typing import Tuple, List, Dict - -class Normal(nn.Module): - def __init__(self): - super(Normal, self).__init__() - - def forward(self, x, y): - r = x + y - return r - -class TupleInput(nn.Module): - def __init__(self): - super(TupleInput, self).__init__() - - def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): - r = z[0] + z[1] - return r - -class ListInput(nn.Module): - def __init__(self): - super(ListInput, self).__init__() - - def forward(self, z: List[torch.Tensor]): - r = z[0] + z[1] - return r - -class TupleInputOutput(nn.Module): - def __init__(self): - super(TupleInputOutput, self).__init__() - - def forward(self, z: Tuple[torch.Tensor, torch.Tensor]): - r1 = z[0] + z[1] - r2 = z[0] - z[1] - r = (r1, r2) - return r - -class ListInputOutput(nn.Module): - def __init__(self): - super(ListInputOutput, self).__init__() - - def forward(self, z: List[torch.Tensor]): - r1 = z[0] + z[1] - r2 = z[0] - z[1] - r = [r1, r2] - return r - -class ComplexModel(nn.Module): - def __init__(self): - super(ComplexModel, self).__init__() - self.list_model = ListInputOutput() - self.tuple_model = TupleInputOutput() - - def forward(self, z: List[torch.Tensor]): - r1 = z[0] + z[1] - r2 = z[0] - z[1] - r3 = (r1, r2) - r4 = [r2, r1] - tuple_out = self.tuple_model(r3) - list_out = self.list_model(r4) - r = (tuple_out[1], list_out[0]) - return r - -input_data = torch.randn((16, 3, 32, 32)) -input_data = input_data.float().to("cuda") - -normal_model = Normal() -normal_model_ts = torch.jit.script(normal_model) 
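# A sketch of the Python-side counterpart (not part of this patch): compiling
# the tuple-input model generated above once the Python API exposes the
# input_signature field added on the C++ side, which lands later in this
# series. The kwarg name `input_signature` is assumed from the C++
# GraphInputs field; the saved-model path is illustrative.
import torch
import torch_tensorrt

mod = torch.jit.load("tuple_input.ts").eval().to("cuda")
spec = torch_tensorrt.Input(shape=[16, 3, 32, 32], dtype=torch.float)
trt_mod = torch_tensorrt.ts.compile(mod, input_signature=((spec, spec),))
x = torch.randn(16, 3, 32, 32).to("cuda")
print(trt_mod((x, x)))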
-print(normal_model_ts.graph) -result = normal_model_ts(input_data, input_data) -normal_model_ts.to("cuda").eval() -torch.jit.save(normal_model_ts, "./normal_model.ts") - -tuple_input = TupleInput() -tuple_input_ts = torch.jit.script(tuple_input) -print(tuple_input_ts.graph) -result = tuple_input_ts((input_data, input_data)) -tuple_input_ts.to("cuda").eval() -torch.jit.save(tuple_input_ts, "./tuple_input.ts") - -list_input = ListInput() -list_input_ts = torch.jit.script(list_input) -print(list_input_ts.graph) -result = list_input_ts([input_data, input_data]) -list_input_ts.to("cuda").eval() -torch.jit.save(list_input_ts, "./list_input.ts") - -tuple_input = TupleInputOutput() -tuple_input_ts = torch.jit.script(tuple_input) -print(tuple_input_ts.graph) -result = tuple_input_ts((input_data, input_data)) -tuple_input_ts.to("cuda").eval() -torch.jit.save(tuple_input_ts, "./tuple_input_output.ts") - -list_input = ListInputOutput() -list_input_ts = torch.jit.script(list_input) -print(list_input_ts.graph) -result = list_input_ts([input_data, input_data]) -list_input_ts.to("cuda").eval() -torch.jit.save(list_input_ts, "./list_input_output.ts") - -complex_model = ComplexModel() -complex_model_ts = torch.jit.script(complex_model) -print(complex_model_ts.graph) -result = complex_model_ts([input_data, input_data]) -complex_model_ts.to("cuda").eval() -torch.jit.save(complex_model_ts, "./complex_model.ts") \ No newline at end of file From a4d4131d78be021589dc825d9f107a119eb1346e Mon Sep 17 00:00:00 2001 From: inocsin Date: Thu, 31 Mar 2022 20:14:45 +0800 Subject: [PATCH 14/22] test: [collection] update model path in test_collection.cpp Signed-off-by: inocsin --- tests/cpp/test_collection.cpp | 12 ++++++------ tests/modules/hub.py | 15 ++++++--------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp index f647af2c8c..73bcabcf13 100644 --- a/tests/cpp/test_collection.cpp +++ b/tests/cpp/test_collection.cpp @@ -8,7 +8,7 @@ TEST(CppAPITests, TestCollectionNormalInput) { - std::string path = "/root/Torch-TensorRT/normal_model.ts"; + std::string path = "tests/modules/normal_model.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -53,7 +53,7 @@ TEST(CppAPITests, TestCollectionNormalInput) { TEST(CppAPITests, TestCollectionTupleInput) { - std::string path = "/root/Torch-TensorRT/tuple_input.ts"; + std::string path = "tests/modules/tuple_input.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); torch::jit::Module mod; @@ -103,7 +103,7 @@ TEST(CppAPITests, TestCollectionTupleInput) { TEST(CppAPITests, TestCollectionListInput) { - std::string path = "/root/Torch-TensorRT/list_input.ts"; + std::string path = "tests/modules/list_input.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -169,7 +169,7 @@ TEST(CppAPITests, TestCollectionListInput) { TEST(CppAPITests, TestCollectionTupleInputOutput) { - std::string path = "/root/Torch-TensorRT/tuple_input_output.ts"; + std::string path = "tests/modules/tuple_input_output.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); @@ -224,7 +224,7 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) { TEST(CppAPITests, TestCollectionListInputOutput) { - std::string path = "/root/Torch-TensorRT/list_input_output.ts"; + std::string path = 
"tests/modules/list_input_output.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); @@ -296,7 +296,7 @@ TEST(CppAPITests, TestCollectionListInputOutput) { TEST(CppAPITests, TestCollectionComplexModel) { - std::string path = "/root/Torch-TensorRT/complex_model.ts"; + std::string path = "tests/modules/complex_model.jit.pt"; torch::Tensor in0 = torch::randn({1, 3, 512, 512}, torch::kCUDA).to(torch::kHalf); std::vector inputs; inputs.push_back(in0); diff --git a/tests/modules/hub.py b/tests/modules/hub.py index f03658321c..a2adc3ab4b 100644 --- a/tests/modules/hub.py +++ b/tests/modules/hub.py @@ -3,11 +3,8 @@ import torch.nn.functional as F import torchvision.models as models import timm -<<<<<<< HEAD from transformers import BertModel, BertTokenizer, BertConfig -======= from typing import Tuple, List, Dict ->>>>>>> feat: [collection] move collection test model to hub.py torch.hub._validate_not_a_forked_repo = lambda a, b, c: True @@ -286,29 +283,29 @@ def forward(self, z: List[torch.Tensor]): normal_model = Normal() normal_model_ts = torch.jit.script(normal_model) normal_model_ts.to("cuda").eval() -torch.jit.save(normal_model_ts, "normal_model.ts") +torch.jit.save(normal_model_ts, "normal_model.jit.pt") tuple_input = TupleInput() tuple_input_ts = torch.jit.script(tuple_input) tuple_input_ts.to("cuda").eval() -torch.jit.save(tuple_input_ts, "tuple_input.ts") +torch.jit.save(tuple_input_ts, "tuple_input.jit.pt") list_input = ListInput() list_input_ts = torch.jit.script(list_input) list_input_ts.to("cuda").eval() -torch.jit.save(list_input_ts, "list_input.ts") +torch.jit.save(list_input_ts, "list_input.jit.pt") tuple_input = TupleInputOutput() tuple_input_ts = torch.jit.script(tuple_input) tuple_input_ts.to("cuda").eval() -torch.jit.save(tuple_input_ts, "tuple_input_output.ts") +torch.jit.save(tuple_input_ts, "tuple_input_output.jit.pt") list_input = ListInputOutput() list_input_ts = torch.jit.script(list_input) list_input_ts.to("cuda").eval() -torch.jit.save(list_input_ts, "list_input_output.ts") +torch.jit.save(list_input_ts, "list_input_output.jit.pt") complex_model = ComplexModel() complex_model_ts = torch.jit.script(complex_model) complex_model_ts.to("cuda").eval() -torch.jit.save(complex_model_ts, "complex_model.ts") +torch.jit.save(complex_model_ts, "complex_model.jit.pt") From 2d585e59d3a00974f32bd9f6d0c174a5be02903b Mon Sep 17 00:00:00 2001 From: inocsin Date: Tue, 5 Apr 2022 12:08:24 +0800 Subject: [PATCH 15/22] fix: [collection] solve confict in ir.cpp Signed-off-by: inocsin --- core/ir/ir.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/core/ir/ir.cpp b/core/ir/ir.cpp index 5da2c121f6..061327c6bc 100644 --- a/core/ir/ir.cpp +++ b/core/ir/ir.cpp @@ -102,11 +102,8 @@ c10::optional get_value_first_calc_dtype_opt(torch::jit::Block* auto consumers = in->uses(); auto search_list = std::vector(consumers.begin(), consumers.end()); - while(search_list.size() > 0) { - // after insertion, original iterator will be invalid - auto& u = search_list.front(); - search_list.erase(search_list.begin()); - auto n = u.user; + for (auto iter = search_list.begin(); iter != search_list.end(); ++iter) { + auto n = iter->user; LOG_GRAPH("Node we are looking at: " << util::node_info(n)); auto ins = n->inputs(); auto outs = n->outputs(); From 5f368105e5abf093ab7af811a2589e4f32d87d63 Mon Sep 17 00:00:00 2001 From: inocsin Date: Wed, 6 Apr 2022 21:07:45 +0800 Subject: [PATCH 16/22] feat: [collection] 
update python api, refactor code Signed-off-by: inocsin --- core/compiler.h | 8 +- core/ir/BUILD | 3 +- core/ir/GraphInputs.cpp | 75 +++++++++++++++++++ core/ir/ir.h | 2 + cpp/src/compile_spec.cpp | 67 ++++------------- .../csrc/register_tensorrt_classes.cpp | 7 ++ py/torch_tensorrt/csrc/tensorrt_classes.cpp | 53 +++++++++++-- py/torch_tensorrt/csrc/tensorrt_classes.h | 8 ++ py/torch_tensorrt/csrc/torch_tensorrt_py.cpp | 7 ++ py/torch_tensorrt/ts/_compile_spec.py | 39 ++++++++-- tests/py/test_collection.py | 60 +++++++++++++++ 11 files changed, 255 insertions(+), 74 deletions(-) create mode 100644 core/ir/GraphInputs.cpp create mode 100644 tests/py/test_collection.py diff --git a/core/compiler.h b/core/compiler.h index 85bc1d2c08..c8dc85020b 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -14,12 +14,8 @@ namespace torch_tensorrt { namespace core { struct CompileSpec { - CompileSpec(std::vector inputs) { - graph_inputs.inputs = inputs; - } - CompileSpec(torch::jit::IValue& input_signature) { - graph_inputs.input_signature = input_signature; - } + CompileSpec(std::vector inputs) : graph_inputs(inputs) {} + CompileSpec(torch::jit::IValue& input_signature) : graph_inputs(input_signature) {} ir::GraphInputs graph_inputs; conversion::ConversionInfo convert_info; lowering::LowerInfo lower_info; diff --git a/core/ir/BUILD b/core/ir/BUILD index a613aaf489..2e9ef7e6a8 100644 --- a/core/ir/BUILD +++ b/core/ir/BUILD @@ -15,7 +15,8 @@ cc_library( srcs = [ "ir.cpp", "Input.cpp", - "StaticParams.cpp" + "StaticParams.cpp", + "GraphInputs.cpp" ], deps = [ "@tensorrt//:nvinfer", diff --git a/core/ir/GraphInputs.cpp b/core/ir/GraphInputs.cpp new file mode 100644 index 0000000000..645624f2f1 --- /dev/null +++ b/core/ir/GraphInputs.cpp @@ -0,0 +1,75 @@ +#include "core/ir/ir.h" +#include "core/util/prelude.h" + +namespace torch_tensorrt { +namespace core { +namespace ir { + +void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, + torch::jit::IValue input_ivalue, int level, int index) { + if (input_ivalue.isTuple()) { + auto input_tuple = input_ivalue.toTuple(); + int idx = 0; + if (level == 0) { + collection_inputs.resize(input_tuple->elements().size()); + } + for (auto item: input_tuple->elements()) { + torch::jit::IValue converted_item; + int cur_idx = level < 1 ? idx: index; + flatten_dfs(flattened_inputs, collection_inputs, item, level+1, cur_idx); + idx++; + } + } else if(input_ivalue.isList()) { + auto input_list = input_ivalue.toList().vec(); + if (level == 0) { + collection_inputs.resize(input_list.size()); + } + c10::TypePtr type = input_list[0].type(); + auto converted_elements = c10::impl::GenericList(type); + int idx = 0; + for (auto item: input_list) { + int cur_idx = level < 1 ? 
idx: index; + flatten_dfs(flattened_inputs, collection_inputs, item, level+1, cur_idx); + idx++; + } + } else if(input_ivalue.isCustomClass()) { + torch_tensorrt::core::ir::Input cur_input = *(input_ivalue.toCustomClass()); + flattened_inputs.push_back(cur_input); + if (level == 0) { // a single value like A + collection_inputs.resize(1); + collection_inputs[0].push_back(cur_input); + } else if (level == 1) { // like A in [A, A] or [(B, B), A] + collection_inputs[index].push_back(cur_input); + } else if (level == 2) { // like A in [(A, A), C] + collection_inputs[index].push_back(cur_input); + } else {// only support 2 level + LOG_ERROR("Input nesting depth exceeds currently supported depth (3), use 1 level: [A, B], or 2 level: [A, (B, C)]"); + } + } +} + + +GraphInputs::GraphInputs(std::vector inputs_) { + LOG_DEBUG("Construct GraphInput with ir::Input"); + inputs = inputs_; + collection_inputs.resize(inputs_.size()); + for (int i = 0; i < inputs_.size(); i++) { + collection_inputs[i].push_back(inputs_[i]); + } +} + +GraphInputs::GraphInputs(torch::jit::IValue& input_signature_) { + LOG_DEBUG("Construct GraphInput with IValue"); + + std::vector flattened_inputs; + std::vector> collection_inputs_; + + flatten_dfs(flattened_inputs, collection_inputs_, input_signature_, 0, 0); + inputs = flattened_inputs; + input_signature = input_signature_; + collection_inputs = collection_inputs_; +} + +} // namespace ir +} // namespace core +} // namespace torch_tensorrt \ No newline at end of file diff --git a/core/ir/ir.h b/core/ir/ir.h index c138ad693b..966c747176 100644 --- a/core/ir/ir.h +++ b/core/ir/ir.h @@ -40,6 +40,8 @@ struct Input : torch::CustomClassHolder { // Add to spec struct GraphInputs { + GraphInputs(std::vector inputs); + GraphInputs(torch::jit::IValue& input_signature); torch::jit::IValue input_signature; // nested Input, full input spec std::vector inputs; // flattend Input std::vector> collection_inputs; // only support two layer nesting, e.g. ((a, b), [c, d], e) diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 366476b227..9447def7e0 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -38,88 +38,47 @@ CompileSpec::CompileSpec(torch::jit::IValue input_signature) { -void flatten_dfs(std::vector& flattened_inputs, std::vector>& collection_inputs, - torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue, int level, int index) { +void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) { if (input_ivalue.isTuple()) { auto input_tuple = input_ivalue.toTuple(); std::vector converted_elements; - int idx = 0; - if (level == 0) { - collection_inputs.resize(input_tuple->elements().size()); - } for (auto item: input_tuple->elements()) { torch::jit::IValue converted_item; - int cur_idx = level < 1 ? idx: index; - flatten_dfs(flattened_inputs, collection_inputs, item, converted_item, level+1, cur_idx); + to_internal_input_signature(item, converted_item); converted_elements.push_back(converted_item); auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements); converted_ivalue = torch::jit::IValue(tuple_ptr); - idx++; } } else if(input_ivalue.isList()) { auto input_list = input_ivalue.toList().vec(); - if (level == 0) { - collection_inputs.resize(input_list.size()); - } c10::TypePtr type = input_list[0].type(); auto converted_elements = c10::impl::GenericList(type); - int idx = 0; for (auto item: input_list) { - int cur_idx = level < 1 ? 
       torch::jit::IValue converted_item;
-      flatten_dfs(flattened_inputs, collection_inputs, item, converted_item, level+1, cur_idx);
+      to_internal_input_signature(item, converted_item);
       converted_elements.push_back(converted_item);
-      idx++;
     }
     converted_ivalue = torch::jit::IValue(converted_elements);
   } else if(input_ivalue.isCustomClass()) {
     torchtrt::core::ir::Input cur_input = to_internal_input(*(input_ivalue.toCustomClass<torchtrt::Input>()));
-    flattened_inputs.push_back(cur_input);
     converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive<torch_tensorrt::core::ir::Input>(cur_input)));
-    if (level == 0) { // a single value like A
-      collection_inputs.resize(1);
-      collection_inputs[0].push_back(cur_input);
-    } else if (level == 1) { // like A in [A, A] or [(B, B), A]
-      collection_inputs[index].push_back(cur_input);
-    } else if (level == 2) { // like A in [(A, A), C]
-      collection_inputs[index].push_back(cur_input);
-    } else { // only 2 levels of nesting are supported
-      LOG_ERROR("Input nesting depth exceeds the currently supported depth; use 1 level: [A, B], or 2 levels: [A, (B, C)]");
-    }
   }
 }

-torch_tensorrt::core::ir::GraphInputs to_internal_graph_inputs(GraphInputs external_graph_input) {
-  torch_tensorrt::core::ir::GraphInputs internal_graph_input;
-
-  std::vector<torchtrt::core::ir::Input> flattened_inputs;
-  std::vector<std::vector<torchtrt::core::ir::Input>> collection_inputs;
-
+torchtrt::core::CompileSpec init_compile_spec(CompileSpec external) {
+  if (external.graph_inputs.inputs.size() > 0) {
+    torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.graph_inputs.inputs));
+    return internal;
+  } else {
     torch::jit::IValue converted_input_signature;
-    flatten_dfs(flattened_inputs, collection_inputs, external_graph_input.input_signature, converted_input_signature, 0, 0);
-    internal_graph_input.inputs = flattened_inputs;
-    internal_graph_input.input_signature = converted_input_signature;
-    internal_graph_input.collection_inputs = collection_inputs;
-
-    LOG_DEBUG("Convert external_graph_input to internal_graph_inputs, total input spec number: " << flattened_inputs.size() << ", top level input spec number " << collection_inputs.size());
-
-    return internal_graph_input;
+    to_internal_input_signature(external.graph_inputs.input_signature, converted_input_signature);
+    torchtrt::core::CompileSpec internal(converted_input_signature);
+    return internal;
+  }
 }

 torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) {
-  torchtrt::core::CompileSpec internal(to_vec_internal_inputs(external.graph_inputs.inputs));
-  if (internal.graph_inputs.inputs.size() == 0) {
-    LOG_DEBUG("GraphInput.inputs size == 0, using GraphInput.input_signature to get Input spec");
-    internal.graph_inputs = to_internal_graph_inputs(external.graph_inputs);
-  } else {
-    LOG_DEBUG("GraphInput.inputs size != 0, using GraphInput.inputs to get Input spec");
-    internal.graph_inputs.collection_inputs.resize(internal.graph_inputs.inputs.size());
-    for (int i = 0; i < internal.graph_inputs.inputs.size(); i++) {
-      internal.graph_inputs.collection_inputs[i].push_back(internal.graph_inputs.inputs[i]);
-    }
-  }
-
+  torchtrt::core::CompileSpec internal = init_compile_spec(external);
   for (auto p : external.enabled_precisions) {
     internal.convert_info.engine_settings.enabled_precisions.insert(toTRTDataType(p));

diff --git a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp
index 53b9fc2cdb..0a9f357c47 100644
--- a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp
+++ b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp
@@ -23,6 +23,13 @@ void RegisterTRTCompileSpec() {
   ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, torch_tensorrt::pyapi::Input, input_is_dynamic);
   ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, torch_tensorrt::pyapi::Input, explicit_set_dtype);

+  static auto TORCHTRT_UNUSED TRTGraphInpuTSRegistration =
+      torch::class_<torch_tensorrt::pyapi::GraphInputs>("tensorrt", "_GraphInputs")
+          .def(torch::init<>())
+          .def("__str__", &torch_tensorrt::pyapi::GraphInputs::to_str);
+
+  ADD_FIELD_GET_SET_REGISTRATION(TRTGraphInpuTSRegistration, torch_tensorrt::pyapi::GraphInputs, input_signature);
+
   static auto TORCHTRT_UNUSED TRTDeviceTSRegistration =
       torch::class_<torch_tensorrt::pyapi::Device>("tensorrt", "_Device")
           .def(torch::init<>())

diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp
index a89fe692bd..9d2761ba95 100644
--- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp
+++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp
@@ -104,6 +104,11 @@ std::string Input::to_str() {
   return ss.str();
 }

+std::string GraphInputs::to_str() {
+  std::stringstream ss;
+  return ss.str();
+}
+
 std::string to_str(DeviceType value) {
   switch (value) {
     case DeviceType::kDLA:
@@ -184,13 +189,51 @@ std::string TorchFallback::to_str() {
   return ss.str();
 }

-core::CompileSpec CompileSpec::toInternalCompileSpec() {
-  std::vector<core::ir::Input> internal_inputs;
-  for (auto i : inputs) {
-    internal_inputs.push_back(i.toInternalInput());
+void to_internal_input_signature(torch::jit::IValue input_ivalue, torch::jit::IValue& converted_ivalue) {
+  if (input_ivalue.isTuple()) {
+    auto input_tuple = input_ivalue.toTuple();
+    std::vector<torch::jit::IValue> converted_elements;
+    for (auto item: input_tuple->elements()) {
+      torch::jit::IValue converted_item;
+      to_internal_input_signature(item, converted_item);
+      converted_elements.push_back(converted_item);
+      auto tuple_ptr = c10::ivalue::Tuple::create(converted_elements);
+      converted_ivalue = torch::jit::IValue(tuple_ptr);
+    }
+  } else if(input_ivalue.isList()) {
+    auto input_list = input_ivalue.toList().vec();
+    c10::TypePtr type = input_list[0].type();
+    auto converted_elements = c10::impl::GenericList(type);
+    for (auto item: input_list) {
+      torch::jit::IValue converted_item;
+      to_internal_input_signature(item, converted_item);
+      converted_elements.push_back(converted_item);
+    }
+    converted_ivalue = torch::jit::IValue(converted_elements);
+  } else if(input_ivalue.isCustomClass()) {
+    core::ir::Input cur_input = (*(input_ivalue.toCustomClass<Input>())).toInternalInput();
+    converted_ivalue = torch::jit::IValue(std::move(c10::make_intrusive<core::ir::Input>(cur_input)));
+  }
+}
+
+core::CompileSpec init_compile_spec(CompileSpec external) {
+  if (external.graph_inputs.inputs.size() > 0) {
+    std::vector<core::ir::Input> internal_inputs;
+    for (auto i : external.graph_inputs.inputs) {
+      internal_inputs.push_back(i.toInternalInput());
+    }
+    core::CompileSpec internal(internal_inputs);
+    return internal;
+  } else {
+    torch::jit::IValue converted_input_signature;
+    to_internal_input_signature(external.graph_inputs.input_signature, converted_input_signature);
+    core::CompileSpec internal(converted_input_signature);
+    return internal;
   }
+}

-  auto info = core::CompileSpec(internal_inputs);
+core::CompileSpec CompileSpec::toInternalCompileSpec() {
+  core::CompileSpec info = init_compile_spec(*this);

   for (auto p : enabled_precisions) {
     info.convert_info.engine_settings.enabled_precisions.insert(toTRTDataType(p));

diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.h b/py/torch_tensorrt/csrc/tensorrt_classes.h
index 0c80641005..7231efa0fa 100644
--- a/py/torch_tensorrt/csrc/tensorrt_classes.h
+++ b/py/torch_tensorrt/csrc/tensorrt_classes.h
@@ -57,6 +57,13 @@ struct Input : torch::CustomClassHolder {
   std::string to_str();
 };

+struct GraphInputs : torch::CustomClassHolder {
+  torch::jit::IValue input_signature; // nested Input, full input spec
+  std::vector<Input> inputs; // flattened input spec
+  ADD_FIELD_GET_SET(input_signature, torch::jit::IValue);
+  std::string to_str();
+};
+
 enum DeviceType : int8_t {
   kGPU,
   kDLA,
@@ -156,6 +163,7 @@ struct CompileSpec : torch::CustomClassHolder {
   ADD_FIELD_GET_SET(ptq_calibrator, nvinfer1::IInt8Calibrator*);

   std::vector<Input> inputs;
+  GraphInputs graph_inputs;
   nvinfer1::IInt8Calibrator* ptq_calibrator = nullptr;
   std::set<DataType> enabled_precisions = {};
   bool sparse_weights = false;

diff --git a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp
index 6e5f333f78..8e89441f56 100644
--- a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp
+++ b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp
@@ -178,6 +178,12 @@ PYBIND11_MODULE(_C, m) {
       .def_readwrite("dtype", &Input::dtype)
       .def_readwrite("format", &Input::format);

+  py::class_<GraphInputs>(m, "GraphInputs")
+      .def(py::init<>())
+      .def("__str__", &torch_tensorrt::pyapi::GraphInputs::to_str)
+      .def_readwrite("input_signature", &GraphInputs::input_signature)
+      .def_readwrite("inputs", &GraphInputs::inputs);
+
   py::enum_<DataType>(m, "dtype", "Enum to specify operating precision for engine execution")
       .value("float", DataType::kFloat, "32 bit floating point number")
       .value("float32", DataType::kFloat, "32 bit floating point number")
@@ -292,6 +298,7 @@ PYBIND11_MODULE(_C, m) {
       .def("__str__", &torch_tensorrt::pyapi::CompileSpec::stringify)
      .def("_get_calibrator_handle", &CompileSpec::getPTQCalibratorHandle, "[Internal] gets a handle from a calibrator")
       .def_readwrite("inputs", &CompileSpec::inputs)
+      .def_readwrite("graph_inputs", &CompileSpec::graph_inputs)
       .def_readwrite("enabled_precisions", &CompileSpec::enabled_precisions)
       .def_readwrite("ptq_calibrator", &CompileSpec::ptq_calibrator)
       .def_readwrite("refit", &CompileSpec::refit)

diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py
index e406096677..5c046a7d1d 100644
--- a/py/torch_tensorrt/ts/_compile_spec.py
+++ b/py/torch_tensorrt/ts/_compile_spec.py
@@ -5,7 +5,7 @@
 from torch_tensorrt import _enums
 from torch_tensorrt._Input import Input
 from torch_tensorrt._Device import Device
-
+from typing import Tuple, List, Dict
 import warnings
@@ -156,6 +156,24 @@ def _parse_torch_fallback(fallback_info: Dict[str, Any]) -> _ts_C.TorchFallback:
     return info

+def _parse_collection_input(input_signature: Any) -> _C.GraphInputs.input_signature:
+    if isinstance(input_signature, tuple):
+        input_list = []
+        for item in input_signature:
+            input = _parse_collection_input(item)
+            input_list.append(input)
+        return tuple(input_list)
+    elif isinstance(input_signature, list):
+        input_list = []
+        for item in input_signature:
+            input = _parse_collection_input(item)
+            input_list.append(input)
+        return input_list
+    elif isinstance(input_signature, Input) or isinstance(input_signature, torch.Tensor):
+        input = Input._from_tensor(input_signature) if isinstance(input_signature, torch.Tensor) else input_signature
+        return input._to_internal()
+    else:
+        raise KeyError("Invalid Input spec")

 def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec:
     info = _ts_C.CompileSpec()
@@ -165,14 +183,19 @@
     )

     if "inputs" in compile_spec:
-        if not all([isinstance(i, torch.Tensor) or isinstance(i, Input) for i in compile_spec["inputs"]]):
-            raise KeyError("Input specs should be either torch_tensorrt.Input or torch.Tensor, found types: {}".format(
-                [type(i) for i in compile_spec["inputs"]]))
-
-        inputs = [Input._from_tensor(i) if isinstance(i, torch.Tensor) else i for i in compile_spec["inputs"]]
-        info.inputs = [i._to_internal() for i in inputs]
+        # if not all([isinstance(i, torch.Tensor) or isinstance(i, Input) for i in compile_spec["inputs"]]):
+        #     raise KeyError("Input specs should be either torch_tensorrt.Input or torch.Tensor, found types: {}".format(
+        #         [type(i) for i in compile_spec["inputs"]]))
+
+        if isinstance(compile_spec["inputs"], list) and all([isinstance(i, torch.Tensor) or isinstance(i, Input) for i in compile_spec["inputs"]]):
+            inputs = [Input._from_tensor(i) if isinstance(i, torch.Tensor) else i for i in compile_spec["inputs"]]
+            # from python Input to torch_tensorrt::pyapi::Input
+            # info.inputs = [i._to_internal() for i in inputs]
+            info.graph_inputs.inputs = [i._to_internal() for i in inputs]
+        else:
+            info.graph_inputs.input_signature = _parse_collection_input(compile_spec["inputs"])

-    assert (len(info.inputs) > 0), "Require at least one input definition to compile model"
+    assert (len(info.graph_inputs.inputs) > 0), "Require at least one input definition to compile model"

     if "enabled_precisions" in compile_spec:
         info.enabled_precisions = _parse_enabled_precisions(compile_spec["enabled_precisions"])

diff --git a/tests/py/test_collection.py b/tests/py/test_collection.py
new file mode 100644
index 0000000000..23e15c99b3
--- /dev/null
+++ b/tests/py/test_collection.py
@@ -0,0 +1,60 @@
+import torch
+import torch.nn as nn
+import torch_tensorrt as torchtrt
+from typing import Tuple, List, Dict
+
+class Normal(nn.Module):
+    def __init__(self):
+        super(Normal, self).__init__()
+
+    def forward(self, x, y):
+        r = x + y
+        return r
+
+class TupleInputOutput(nn.Module):
+    def __init__(self):
+        super(TupleInputOutput, self).__init__()
+
+    def forward(self, z: Tuple[torch.Tensor, torch.Tensor]):
+        r1 = z[0] + z[1]
+        r2 = z[0] - z[1]
+        r = (r1, r2)
+        return r
+
+input = torch.randn((1, 3, 224, 224)).to("cuda")
+normal_model = Normal()
+scripted_model = torch.jit.script(normal_model)
+
+compile_spec = {
+    "inputs": [torchtrt.Input(input.shape, dtype=torch.float, format=torch.contiguous_format),
+               torchtrt.Input(input.shape, dtype=torch.float, format=torch.contiguous_format)],
+    "device": {
+        "device_type": torchtrt.DeviceType.GPU,
+        "gpu_id": 0,
+    },
+    "enabled_precisions": {torch.float}
+}
+
+trt_mod = torchtrt.ts.compile(scripted_model, **compile_spec)
+same = (trt_mod(input, input) - scripted_model(input, input)).abs().max()
+print(same.cpu())
+
+# input = torch.randn((1, 3, 224, 224)).to("cuda")
+# tuple_model = TupleInputOutput()
+# scripted_model = torch.jit.script(tuple_model)
+
+# compile_spec = {
+#     "inputs": (torchtrt.Input(input.shape, dtype=torch.float, format=torch.contiguous_format),
+#                torchtrt.Input(input.shape, dtype=torch.float, format=torch.contiguous_format)),
+#     "device": {
+#         "device_type": torchtrt.DeviceType.GPU,
+#         "gpu_id": 0,
+#     },
+#     "enabled_precisions": {torch.float}
+# }
+
+# trt_mod = torchtrt.ts.compile(scripted_model, **compile_spec)
+# same = (trt_mod((input, input))[0] - scripted_model((input, input))[0]).abs().max()
+# print(same.cpu())
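The same collection path can also be exercised end to end from C++. A minimal sketch, assuming a scripted module mod that takes a Tuple[Tensor, Tensor] like TupleInputOutput above; the shapes are illustrative and this is not the shipped cpp test:

    auto in = torch_tensorrt::Input(std::vector<int64_t>{1, 3, 224, 224}, torch::kFloat);
    auto iv = torch::jit::IValue(c10::make_intrusive<torch_tensorrt::Input>(in));
    torch::jit::IValue signature = c10::ivalue::Tuple::create({iv, iv});

    auto spec = torch_tensorrt::ts::CompileSpec(signature);
    // tuple indexing still runs in Torch at this point in the series
    spec.torch_executed_ops.push_back("aten::__getitem__");
    auto trt_mod = torch_tensorrt::ts::compile(mod, spec);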
From d9d86656c94d7633e2e5d2f27c4740b6fad9827d Mon Sep 17 00:00:00 2001
From: inocsin
Date: Fri, 8 Apr 2022 18:26:41 +0800
Subject: [PATCH 17/22] fix: [collection] remove the aten::__getitem__ and prim::ListConstruct evaluators

Signed-off-by: inocsin
---
 core/conversion/evaluators/aten.cpp | 15 -------
 core/conversion/evaluators/prim.cpp | 62 -----------------------------
 2 files changed, 77 deletions(-)

diff --git a/core/conversion/evaluators/aten.cpp b/core/conversion/evaluators/aten.cpp
index 30cdeaa46a..fde9e71e66 100644
--- a/core/conversion/evaluators/aten.cpp
+++ b/core/conversion/evaluators/aten.cpp
@@ -264,21 +264,6 @@ auto aten_registrations TORCHTRT_UNUSED =
         },
         EvalOptions().validSchemas(
             {"aten::size(Tensor self) -> (int[])", "aten::size.int(Tensor self, int dim) -> (int)"})})
-        .evaluator({c10::Symbol::fromQualString("aten::__getitem__"),
-                    [](const torch::jit::Node* n, kwargs& args) -> c10::optional<torch::jit::IValue> {
-                      auto list = args.at(n->input(0)).IValue()->to<c10::List<c10::IValue>>();
-                      auto idx = args.at(n->input(1)).unwrapToInt();
-
-                      const int64_t list_size = list.size();
-                      const int64_t normalized_idx = normalizeIndex(idx, list_size);
-                      TORCHTRT_CHECK(
-                          normalized_idx >= 0 || normalized_idx < list_size,
-                          "List index out of range (aten::__getitem__)");
-                      return list.get(normalized_idx);
-                    },
-                    EvalOptions().validSchemas({
-                        "aten::__getitem__.t(t[](a) list, int idx) -> (t(*))",
-                    })})
        .evaluator({c10::Symbol::fromQualString("aten::append"),
                    [](const torch::jit::Node* n, kwargs& args) -> c10::optional<torch::jit::IValue> {
                      auto list = args.at(n->input(0)).IValue()->to<c10::List<c10::IValue>>();

diff --git a/core/conversion/evaluators/prim.cpp b/core/conversion/evaluators/prim.cpp
index 7d5373a5f9..56e980189f 100755
--- a/core/conversion/evaluators/prim.cpp
+++ b/core/conversion/evaluators/prim.cpp
@@ -40,68 +40,6 @@ auto prim_registrations =
         auto outputVec = outputs->toList().vec();
         return std::move(c10::ivalue::Tuple::create(outputVec));
       }})
-      .evaluator({torch::jit::prim::ListConstruct,
-                  [](const torch::jit::Node* n, kwargs& args) -> c10::optional<torch::jit::IValue> {
-                    const auto num_inputs = n->inputs().size();
-                    if (constTypesOnly(args)) {
-                      c10::ListTypePtr lt = n->output()->type()->expect<c10::ListType>();
-                      if (torch::jit::IntType::get() == lt->getElementType()) {
-                        c10::List<int64_t> list;
-                        list.reserve(num_inputs);
-                        for (auto in : n->inputs()) {
-                          list.emplace_back(std::move(args.at(in).unwrapToInt()));
-                        }
-                        return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
-                      } else if (torch::jit::FloatType::get() == lt->getElementType()) {
-                        c10::List<double> list;
-                        list.reserve(num_inputs);
-                        for (auto in : n->inputs()) {
-                          list.emplace_back(std::move(args.at(in).unwrapToDouble()));
-                        }
-                        return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
-                      } else if (lt->getElementType() == torch::jit::BoolType::get()) {
-                        c10::List<bool> list;
-                        list.reserve(num_inputs);
-                        for (auto in : n->inputs()) {
-                          list.emplace_back(std::move(args.at(in).unwrapToBool()));
-                        }
-                        return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
-                      } else if (lt->getElementType()->isSubtypeOf(torch::jit::TensorType::get())) {
-                        c10::List<at::Tensor> list;
-                        list.reserve(num_inputs);
-                        for (auto in : n->inputs()) {
-                          if (args.at(in).isIValue()) {
-                            list.emplace_back(std::move(args.at(in).unwrapToTensor()));
-                          }
-                        }
-                        return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
-                      } else {
-                        c10::TypePtr elementType = lt->getElementType();
-                        auto list = c10::impl::GenericList(elementType);
-                        list.reserve(num_inputs);
-                        for (auto in : n->inputs()) {
-                          list.emplace_back(std::move(*(args.at(in).IValue())));
-                        }
-                        return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
-                      }
-                    } else {
-                      c10::ListTypePtr lt = n->output()->type()->expect<c10::ListType>();
-                      c10::TypePtr elementType = lt->getElementType();
-                      auto list = c10::impl::GenericList(elementType);
-                      list.reserve(num_inputs);
-                      for (auto in : n->inputs()) {
-                        if (args.at(in).isITensor()) {
-                          auto tensor_holder = TensorContainer();
-                          tensor_holder.hold_tensor(args.at(in).ITensor());
-                          auto ival = c10::IValue(std::move(c10::make_intrusive<TensorContainer>(tensor_holder)));
-                          list.emplace_back(std::move(ival));
-                        } else {
-                          list.emplace_back(std::move(args.at(in).unwrapToTensor()));
-                        }
-                      }
-                      return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
-                    }
-                  }})
    .evaluator({c10::Symbol::fromQualString("prim::dtype"),
                [](const torch::jit::Node* n, kwargs& args) -> c10::optional<torch::jit::IValue> {
                  auto input = args.at(n->input(0));
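Two details of the removed aten::__getitem__ evaluator are worth noting while it is gone: it mirrors Python's negative-index semantics via normalizeIndex, and its bounds check joins the two conditions with ||, which can never fail; && is evidently what was meant. A self-contained sketch of the intended semantics (the helper body is assumed, not quoted from the tree):

    #include <cassert>
    #include <cstdint>

    // Python-style wraparound: list[-1] is the last element
    int64_t normalizeIndex(int64_t idx, int64_t list_size) {
      return idx < 0 ? idx + list_size : idx;
    }

    int main() {
      assert(normalizeIndex(-1, 4) == 3);
      assert(normalizeIndex(2, 4) == 2);
      // a usable index must satisfy both bounds:
      // normalized_idx >= 0 && normalized_idx < list_size
      return 0;
    }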
From 991f0232d046e50420cabf6e8cafa1c862b56061 Mon Sep 17 00:00:00 2001
From: inocsin
Date: Tue, 12 Apr 2022 11:51:53 +0800
Subject: [PATCH 18/22] [collection] rebase to master, update some APIs

Signed-off-by: inocsin
---
 core/conversion/evaluators/prim.cpp           | 62 +++++++++++++++++++
 core/partitioning/shape_analysis.h            |  3 -
 .../test_resolve_nontensor_inputs.cpp         | 16 ++---
 .../core/partitioning/test_shape_analysis.cpp | 16 ++---
 4 files changed, 78 insertions(+), 19 deletions(-)

diff --git a/core/conversion/evaluators/prim.cpp b/core/conversion/evaluators/prim.cpp
index 56e980189f..7d5373a5f9 100755
--- a/core/conversion/evaluators/prim.cpp
+++ b/core/conversion/evaluators/prim.cpp
@@ -40,6 +40,68 @@ auto prim_registrations =
         auto outputVec = outputs->toList().vec();
         return std::move(c10::ivalue::Tuple::create(outputVec));
       }})
+      .evaluator({torch::jit::prim::ListConstruct,
+                  [](const torch::jit::Node* n, kwargs& args) -> c10::optional<torch::jit::IValue> {
+                    const auto num_inputs = n->inputs().size();
+                    if (constTypesOnly(args)) {
+                      c10::ListTypePtr lt = n->output()->type()->expect<c10::ListType>();
+                      if (torch::jit::IntType::get() == lt->getElementType()) {
+                        c10::List<int64_t> list;
+                        list.reserve(num_inputs);
+                        for (auto in : n->inputs()) {
+                          list.emplace_back(std::move(args.at(in).unwrapToInt()));
+                        }
+                        return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
+                      } else if (torch::jit::FloatType::get() == lt->getElementType()) {
+                        c10::List<double> list;
+                        list.reserve(num_inputs);
+                        for (auto in : n->inputs()) {
+                          list.emplace_back(std::move(args.at(in).unwrapToDouble()));
+                        }
+                        return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
+                      } else if (lt->getElementType() == torch::jit::BoolType::get()) {
+                        c10::List<bool> list;
+                        list.reserve(num_inputs);
+                        for (auto in : n->inputs()) {
+                          list.emplace_back(std::move(args.at(in).unwrapToBool()));
+                        }
+                        return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
+                      } else if (lt->getElementType()->isSubtypeOf(torch::jit::TensorType::get())) {
+                        c10::List<at::Tensor> list;
+                        list.reserve(num_inputs);
+                        for (auto in : n->inputs()) {
+                          if (args.at(in).isIValue()) {
+                            list.emplace_back(std::move(args.at(in).unwrapToTensor()));
+                          }
+                        }
+                        return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
+                      } else {
+                        c10::TypePtr elementType = lt->getElementType();
+                        auto list = c10::impl::GenericList(elementType);
+                        list.reserve(num_inputs);
+                        for (auto in : n->inputs()) {
+                          list.emplace_back(std::move(*(args.at(in).IValue())));
+                        }
+                        return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
+                      }
+                    } else {
+                      c10::ListTypePtr lt = n->output()->type()->expect<c10::ListType>();
+                      c10::TypePtr elementType = lt->getElementType();
+                      auto list = c10::impl::GenericList(elementType);
+                      list.reserve(num_inputs);
+                      for (auto in : n->inputs()) {
+                        if (args.at(in).isITensor()) {
+                          auto tensor_holder = TensorContainer();
+                          tensor_holder.hold_tensor(args.at(in).ITensor());
+                          auto ival = c10::IValue(std::move(c10::make_intrusive<TensorContainer>(tensor_holder)));
+                          list.emplace_back(std::move(ival));
+                        } else {
+                          list.emplace_back(std::move(args.at(in).unwrapToTensor()));
+                        }
+                      }
+                      return c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
+                    }
+                  }})
    .evaluator({c10::Symbol::fromQualString("prim::dtype"),
                [](const torch::jit::Node* n, kwargs& args) -> c10::optional<torch::jit::IValue> {
                  auto input = args.at(n->input(0));

diff --git a/core/partitioning/shape_analysis.h b/core/partitioning/shape_analysis.h
index 46450eb0f8..2654699a1d 100644
--- a/core/partitioning/shape_analysis.h
+++ b/core/partitioning/shape_analysis.h
@@ -6,9 +6,6 @@ namespace torch_tensorrt {
 namespace core {
 namespace partitioning {

-// std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomInputs(
-//     std::unordered_map<const torch::jit::Value*, ir::Input>& input_ranges,
-//     std::unordered_map<const torch::jit::Value*, c10::optional<at::ScalarType>>& input_types);
 std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomInputs(
     std::unordered_map<const torch::jit::Value*, std::vector<ir::Input>>& input_ranges,

diff --git a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp
index a83d2330e4..7daaedab8c 100644
--- a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp
+++ b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp
@@ -116,11 +116,11 @@ TEST(Partitioning, ResolveNonTensorInputsCorrectly) {
   inputs.push_back(torch_tensorrt::core::ir::Input({16, 3, 3, 3}));
   inputs.push_back(torch_tensorrt::core::ir::Input({16}));

-  std::unordered_map<const torch::jit::Value*, torch_tensorrt::core::ir::Input> inputs_map;
-  std::unordered_map<const torch::jit::Value*, c10::optional<at::ScalarType>> input_types;
+  std::unordered_map<const torch::jit::Value*, std::vector<torch_tensorrt::core::ir::Input>> inputs_map;
+  std::unordered_map<const torch::jit::Value*, std::vector<c10::optional<at::ScalarType>>> input_types;
   for (size_t i = 0; i < g->inputs().size(); ++i) {
-    inputs_map.insert({g->inputs()[i], inputs[i]});
-    input_types.insert({g->inputs()[i], {at::kFloat}});
+    inputs_map.insert({g->inputs()[i], {inputs[i]}});
+    input_types.insert({g->inputs()[i], {{at::kFloat}}});
   }
   auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types);
   std::vector<torch_tensorrt::core::partitioning::SegmentedBlock> segmented_blocks =
@@ -174,11 +174,11 @@ TEST(Partitioning, ResolveTensorListInputsInTrtCorrectly) {
   inputs.push_back(torch_tensorrt::core::ir::Input({16, 6, 3, 3}));
   inputs.push_back(torch_tensorrt::core::ir::Input({16}));

-  std::unordered_map<const torch::jit::Value*, torch_tensorrt::core::ir::Input> inputs_map;
-  std::unordered_map<const torch::jit::Value*, c10::optional<at::ScalarType>> input_types;
+  std::unordered_map<const torch::jit::Value*, std::vector<torch_tensorrt::core::ir::Input>> inputs_map;
+  std::unordered_map<const torch::jit::Value*, std::vector<c10::optional<at::ScalarType>>> input_types;
   for (size_t i = 0; i < g->inputs().size(); ++i) {
-    inputs_map.insert({g->inputs()[i], inputs[i]});
-    input_types.insert({g->inputs()[i], {at::kFloat}});
+    inputs_map.insert({g->inputs()[i], {inputs[i]}});
+    input_types.insert({g->inputs()[i], {{at::kFloat}}});
   }
   auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types);
   std::vector<torch_tensorrt::core::partitioning::SegmentedBlock> segmented_blocks =

diff --git a/tests/core/partitioning/test_shape_analysis.cpp b/tests/core/partitioning/test_shape_analysis.cpp
index 8effa821ae..d05f10c163 100644
--- a/tests/core/partitioning/test_shape_analysis.cpp
+++ b/tests/core/partitioning/test_shape_analysis.cpp
@@ -59,11 +59,11 @@ TEST(Partitioning, InferSequentialModelSegmentedBlockShapeCorrectly) {
   inputs.push_back(torch_tensorrt::core::ir::Input({8, 16, 3, 3}));
   inputs.push_back(torch_tensorrt::core::ir::Input({8}));

-  std::unordered_map<const torch::jit::Value*, torch_tensorrt::core::ir::Input> inputs_map;
-  std::unordered_map<const torch::jit::Value*, c10::optional<at::ScalarType>> input_types;
+  std::unordered_map<const torch::jit::Value*, std::vector<torch_tensorrt::core::ir::Input>> inputs_map;
+  std::unordered_map<const torch::jit::Value*, std::vector<c10::optional<at::ScalarType>>> input_types;
   for (size_t i = 0; i < g->inputs().size(); ++i) {
-    inputs_map.insert({g->inputs()[i], inputs[i]});
-    input_types.insert({g->inputs()[i], {at::kFloat}});
+    inputs_map.insert({g->inputs()[i], {inputs[i]}});
+    input_types.insert({g->inputs()[i], {{at::kFloat}}});
   }
   auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types);
   std::vector<torch_tensorrt::core::partitioning::SegmentedBlock> segmented_blocks =
@@ -109,11 +109,11 @@ TEST(Partitioning, InferBranchModelSegmentedBlockShapeCorrectly) {
   inputs.push_back(torch_tensorrt::core::ir::Input({16, 32, 3, 3}));
   inputs.push_back(torch_tensorrt::core::ir::Input({16}));

-  std::unordered_map<const torch::jit::Value*, torch_tensorrt::core::ir::Input> inputs_map;
-  std::unordered_map<const torch::jit::Value*, c10::optional<at::ScalarType>> input_types;
+  std::unordered_map<const torch::jit::Value*, std::vector<torch_tensorrt::core::ir::Input>> inputs_map;
+  std::unordered_map<const torch::jit::Value*, std::vector<c10::optional<at::ScalarType>>> input_types;
   for (size_t i = 0; i < g->inputs().size(); ++i) {
-    inputs_map.insert({g->inputs()[i], inputs[i]});
-    input_types.insert({g->inputs()[i], {at::kFloat}});
+    inputs_map.insert({g->inputs()[i], {inputs[i]}});
+    input_types.insert({g->inputs()[i], {{at::kFloat}}});
   }
   auto input_ivalues_map = torch_tensorrt::core::partitioning::generateRandomInputs(inputs_map, input_types);
   std::vector<torch_tensorrt::core::partitioning::SegmentedBlock> segmented_blocks =

From 016c991faf7632add3a4929b16d197c826bfc694 Mon Sep 17 00:00:00 2001
From: inocsin
Date: Thu, 14 Apr 2022 20:14:52 +0800
Subject: [PATCH 19/22] feat: [collection] handle prim::ListConstruct without manually falling it back

Signed-off-by: inocsin
---
 core/partitioning/partitioning.cpp | 24 +++++++++++++++++-------
 tests/cpp/test_collection.cpp      |  2 --
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp
index d171ae15c0..93ee4ab2a6 100644
--- a/core/partitioning/partitioning.cpp
+++ b/core/partitioning/partitioning.cpp
@@ -419,6 +419,15 @@ bool checkLoopEvaluatable(torch::jit::Node* n) {
   return compile_to_trt;
 }

+bool is_collection(torch::jit::Node* n) {
+  for (auto out: n->outputs()) {
+    if (out->type()->kind() == torch::jit::TypeKind::TupleType || out->type()->kind() == torch::jit::TypeKind::ListType) {
+      return true;
+    }
+  }
+  return false;
+}
+
 bool should_run_in_trt(torch::jit::Node* n, const std::unordered_set<std::string>& torch_ops) {
   // If the op is not supported by the conversion phase it should run in PyTorch
   if (!conversion::OpSupported(n)) {
@@ -459,18 +468,19 @@ PartitionedGraph segment_graph(torch::jit::Block* block, const PartitionInfo& partition_info) {
       partition_info.forced_fallback_operators.begin(), partition_info.forced_fallback_operators.end());

   auto nodes = block->nodes();
+  auto reverse_nodes = nodes.reverse(); // merge from the output side to the input side
   PartitionedGraph segmented_blocks;

   // segment the nodes
   std::vector<torch::jit::Node*> in_prog_trt_blk_nodes, in_prog_pyt_blk_nodes;
-  for (const auto n : nodes) {
+  for (const auto n : reverse_nodes) {
     // Skip constant nodes as they are resources for both kinds of modules
     if (n->kind() == torch::jit::prim::Constant) {
       continue;
     }
-
-    if (should_run_in_trt(n, forced_fallback_ops)) {
-      in_prog_trt_blk_nodes.push_back(n);
+    // the outputs of a TRT subgraph shouldn't be collections
+    if (should_run_in_trt(n, forced_fallback_ops) && !(in_prog_trt_blk_nodes.size() == 0 && is_collection(n))) {
+      in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n);

       // If there is an active PyTorch block and we have passed the threshold for a valid TRT
       // block then segment and reset the active PyTorch block
@@ -505,14 +515,14 @@ PartitionedGraph segment_graph(torch::jit::Block* block, const PartitionInfo& partition_info) {
         finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes);
       }
       if (checkLoopEvaluatable(n)) {
-        in_prog_trt_blk_nodes.push_back(n);
+        in_prog_trt_blk_nodes.insert(in_prog_trt_blk_nodes.begin(), n);
       } else {
         auto loop_node = std::vector<torch::jit::Node*>{n};
         finalize_block(segmented_blocks, SegmentedBlock::kTorch, loop_node);
       }
       continue;
     }
-    in_prog_pyt_blk_nodes.push_back(n);
+    in_prog_pyt_blk_nodes.insert(in_prog_pyt_blk_nodes.begin(), n);
   }
 }

@@ -527,7 +537,7 @@ PartitionedGraph segment_graph(torch::jit::Block* block, const PartitionInfo& partition_info) {
         in_prog_pyt_blk_nodes.end(), in_prog_trt_blk_nodes.begin(), in_prog_trt_blk_nodes.end());
     finalize_block(segmented_blocks, SegmentedBlock::kTorch, in_prog_pyt_blk_nodes);
   }
-
+  std::reverse(segmented_blocks.begin(), segmented_blocks.end());
   return segmented_blocks;
 }
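The effect of the reversed walk is easiest to see on a toy sequence. A self-contained illustration, not Torch-TensorRT code: the guard above keeps a convertible, collection-producing node from becoming the tail (in graph order) of a TRT block, because that node's outputs would otherwise have to be engine outputs:

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      // graph order: conv -> relu -> prim::ListConstruct (graph output)
      std::vector<std::string> nodes = {"conv", "relu", "prim::ListConstruct"};
      std::vector<std::string> trt_blk, torch_blk;
      for (auto it = nodes.rbegin(); it != nodes.rend(); ++it) {
        bool collection = (*it == "prim::ListConstruct");
        // mirrors: should_run_in_trt(n, ...) && !(trt block empty && is_collection(n))
        if (!collection || !trt_blk.empty()) {
          trt_blk.insert(trt_blk.begin(), *it);
        } else {
          torch_blk.insert(torch_blk.begin(), *it);
        }
      }
      // prints: TRT: conv relu | Torch: prim::ListConstruct
      std::cout << "TRT:";
      for (auto& n : trt_blk) std::cout << " " << n;
      std::cout << " | Torch:";
      for (auto& n : torch_blk) std::cout << " " << n;
      std::cout << "\n";
      return 0;
    }

Once a TRT block already has a member, a collection node may still merge into its interior, since interior values never cross the engine boundary.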
diff --git a/tests/cpp/test_collection.cpp b/tests/cpp/test_collection.cpp
index 73bcabcf13..9308d951f4 100644
--- a/tests/cpp/test_collection.cpp
+++ b/tests/cpp/test_collection.cpp
@@ -280,7 +280,6 @@ TEST(CppAPITests, TestCollectionListInputOutput) {

   // Need to skip the conversion of __getitem__ and ListConstruct
   compile_settings.torch_executed_ops.push_back("aten::__getitem__");
-  compile_settings.torch_executed_ops.push_back("prim::ListConstruct");

   // // FP16 execution
   compile_settings.enabled_precisions = {torch::kHalf};
@@ -351,7 +350,6 @@ TEST(CppAPITests, TestCollectionComplexModel) {

   // Need to skip the conversion of __getitem__ and ListConstruct
   compile_settings.torch_executed_ops.push_back("aten::__getitem__");
-  compile_settings.torch_executed_ops.push_back("prim::ListConstruct");

   // // FP16 execution
   compile_settings.enabled_precisions = {torch::kHalf};

From 2e7cd5899d1b20946b419e218f00d4824ab2737b Mon Sep 17 00:00:00 2001
From: inocsin
Date: Thu, 14 Apr 2022 21:57:21 +0800
Subject: [PATCH 20/22] chore: [collection] update test_resolve_nontensor_inputs.cpp

Signed-off-by: inocsin
---
 tests/core/partitioning/test_resolve_nontensor_inputs.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp
index 7daaedab8c..e70d5d2b5d 100644
--- a/tests/core/partitioning/test_resolve_nontensor_inputs.cpp
+++ b/tests/core/partitioning/test_resolve_nontensor_inputs.cpp
@@ -255,5 +255,5 @@ TEST(Partitioning, ConvertForTensorListInputsInFallbackCorrectly) {
   torch::jit::script::Module new_mod = torch_tensorrt::core::CompileGraph(mod, cfg);
   auto fallback_g = new_mod.get_method("forward").graph();
   int count = count_trt_engines(fallback_g);
-  ASSERT_TRUE(count == 2);
+  ASSERT_TRUE(count == 1);
 }
From b35cdd06e4910a978e35b7f7225a0b05331ff489 Mon Sep 17 00:00:00 2001
From: inocsin
Date: Thu, 14 Apr 2022 22:36:03 +0800
Subject: [PATCH 21/22] fix: [collection] handle the case where only the output is a collection and all the nodes can be converted

Signed-off-by: inocsin
---
 core/compiler.cpp              | 6 ++++--
 core/conversion/conversion.cpp | 9 +++++++++
 core/conversion/conversion.h   | 2 ++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/core/compiler.cpp b/core/compiler.cpp
index 57b4667bce..72243835dd 100644
--- a/core/compiler.cpp
+++ b/core/compiler.cpp
@@ -437,6 +437,7 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) {
   MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
   auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true);
+  auto outputIsCollection = conversion::OutputIsCollection(g->block());
   if (cfg.partition_info.enabled &&
       (cfg.lower_info.forced_fallback_modules.size() == 0 &&
        cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) {
@@ -444,8 +445,9 @@
   }

   if (cfg.partition_info.enabled &&
-      !(cfg.lower_info.forced_fallback_modules.size() == 0 &&
-        cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) {
+      (!(cfg.lower_info.forced_fallback_modules.size() == 0 &&
+         cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)
+      || outputIsCollection)) {
     auto collection_input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.collection_input_spec_map, first_use_types);
     auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), collection_input_ivalues_map, cfg, static_params);

diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp
index 3c9eb0dea4..bddd8fd835 100644
--- a/core/conversion/conversion.cpp
+++ b/core/conversion/conversion.cpp
@@ -555,6 +555,15 @@ std::set<std::string> ConvertableOpsInBlock(const torch::jit::Block* b) {
   return convertable_ops;
 }

+bool OutputIsCollection(const torch::jit::Block* b) {
+  for (auto out: b->outputs()) {
+    if (out->type()->kind() == torch::jit::TypeKind::TupleType || out->type()->kind() == torch::jit::TypeKind::ListType) {
+      return true;
+    }
+  }
+  return false;
+}
+
 bool VerifyConverterSupportForBlock(const torch::jit::Block* b, bool suppress_errors) {
   auto unsupported_ops = GetUnsupportedOpsInBlock(b);
   if (unsupported_ops.size() != 0) {

diff --git a/core/conversion/conversion.h b/core/conversion/conversion.h
index 148d99ac13..a578c4288e 100644
--- a/core/conversion/conversion.h
+++ b/core/conversion/conversion.h
@@ -26,6 +26,8 @@ std::string ConvertBlockToEngine(

 bool OpSupported(const torch::jit::Node* n);

+bool OutputIsCollection(const torch::jit::Block* b);
+
 bool VerifyConverterSupportForBlock(const torch::jit::Block* b, bool suppress_errors = false);

 c10::optional<torch::jit::IValue> EvaluateNode(

From fa6c10e7fd72ec1989e6ed6b17c352c4cb0e4655 Mon Sep 17 00:00:00 2001
From: inocsin
Date: Tue, 19 Apr 2022 10:02:54 +0800
Subject: [PATCH 22/22] fix: [collection] update tests/cpp/test_example_tensors.cpp

Signed-off-by: inocsin
---
 tests/cpp/test_example_tensors.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/cpp/test_example_tensors.cpp b/tests/cpp/test_example_tensors.cpp
index 6561cd16a0..7e16f47f70 100644
--- a/tests/cpp/test_example_tensors.cpp
+++ b/tests/cpp/test_example_tensors.cpp
@@ -8,8 +8,8 @@ TEST_P(CppAPITests, InputsFromTensors) {
     jit_inputs_ivalues.push_back(in.clone());
     trt_inputs_ivalues.push_back(in.clone());
   }
-
-  auto spec = torch_tensorrt::ts::CompileSpec({trt_inputs_ivalues[0].toTensor()});
+  std::vector<torch::Tensor> inputs = {trt_inputs_ivalues[0].toTensor()};
+  auto spec = torch_tensorrt::ts::CompileSpec(inputs);

   auto trt_mod = torch_tensorrt::ts::compile(mod, spec);
   torch::jit::IValue trt_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(trt_mod, trt_inputs_ivalues);
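The output-collection case from patch 21 is easy to reproduce in isolation. A sketch with IR parsed ad hoc, not one of the shipped tests: every node below has a converter or evaluator, yet the block output is a TupleType, so OutputIsCollection() returns true and CompileGraph takes the fallback-graph path even with no forced-fallback ops or modules:

    #include "core/conversion/conversion.h"
    #include "torch/csrc/jit/ir/irparser.h"

    auto g = std::make_shared<torch::jit::Graph>();
    torch::jit::parseIR(R"IR(
      graph(%x : Tensor, %y : Tensor):
        %1 : int = prim::Constant[value=1]()
        %a : Tensor = aten::add(%x, %y, %1)
        %b : Tensor = aten::sub(%x, %y, %1)
        %r : (Tensor, Tensor) = prim::TupleConstruct(%a, %b)
        return (%r))IR",
      g.get());
    // true: the block returns a tuple, not a plain tensor
    bool needs_fallback = torch_tensorrt::core::conversion::OutputIsCollection(g->block());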