Skip to content

Commit eecf74f

Browse files
winskuo-quic authored
and facebook-github-bot committed
Qualcomm AI Engine Direct - Intermediate Tensor Dump (#5310)
Summary: - Intermediate Tensor Dump enablement - Added UT for intermediate tensor dump Pull Request resolved: #5310 Reviewed By: kirklandsign Differential Revision: D62616471 Pulled By: cccclai fbshipit-source-id: b571c6a0a9537e3a93c3ff752b07e5f05bd1d580
1 parent eb0cdf7 commit eecf74f

14 files changed

+236
-66
lines changed

backends/qualcomm/CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ target_link_libraries(
181181
)
182182
target_link_libraries(
183183
qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager
184-
executorch_no_prim_ops qcir_utils
184+
executorch_no_prim_ops qcir_utils extension_tensor
185185
)
186186
set_target_properties(
187187
qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
@@ -246,6 +246,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
246246
qnn_executorch_header
247247
executorch
248248
qcir_utils
249+
extension_tensor
249250
)
250251
target_link_libraries(
251252
PyQnnWrapperAdaptor PRIVATE pybind11::module pybind11::lto wrappers

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -213,8 +213,10 @@ Error QnnExecuTorchBackend::execute(
213213
}
214214

215215
ET_CHECK_OR_RETURN_ERROR(
216-
qnn_manager->Execute(input_tensor_structs, output_tensor_structs) ==
217-
Error::Ok,
216+
qnn_manager->Execute(
217+
input_tensor_structs,
218+
output_tensor_structs,
219+
context.event_tracer()) == Error::Ok,
218220
Internal,
219221
"Fail to execute graph");
220222
ET_CHECK_OR_RETURN_ERROR(

backends/qualcomm/runtime/QnnManager.cpp

+19-22
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <executorch/backends/qualcomm/runtime/Utils.h>
1111
#include <executorch/backends/qualcomm/runtime/backends/QnnBackendCommon.h>
1212
#include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>
13+
#include <executorch/extension/tensor/tensor.h>
1314
#include <algorithm>
1415
#include <cstdlib>
1516
#include <cstring>
@@ -57,9 +58,7 @@ QnnManager::QnnManager(
5758
"backend_type: %s", EnumNameQnnExecuTorchBackendType(backend_type));
5859
QNN_EXECUTORCH_LOG_INFO("graph_name: %s", options_->graph_name()->c_str());
5960
QNN_EXECUTORCH_LOG_INFO("library_path: %s", library_path.c_str());
60-
QNN_EXECUTORCH_LOG_INFO(
61-
"tensor_dump_output_path: %s",
62-
options_->tensor_dump_output_path()->c_str());
61+
QNN_EXECUTORCH_LOG_INFO("dump intermediate outputs: %s", IsTensorDump());
6362
QNN_EXECUTORCH_LOG_INFO(
6463
"log_level: %s", EnumNameQnnExecuTorchLogLevel(options_->log_level()));
6564
QNN_EXECUTORCH_LOG_INFO(
@@ -366,7 +365,8 @@ Error QnnManager::AllocateTensor(
366365

367366
Error QnnManager::Execute(
368367
const std::vector<Qnn_Tensor_t>& input_tensor_structs,
369-
std::vector<Qnn_Tensor_t>& output_tensor_structs) {
368+
std::vector<Qnn_Tensor_t>& output_tensor_structs,
369+
EventTracer* event_tracer) {
370370
Qnn_ErrorHandle_t error = QNN_SUCCESS;
371371

372372
error = backend_params_ptr_->qnn_graph_ptr_->GraphExecute(
@@ -377,30 +377,27 @@ Error QnnManager::Execute(
377377
"qnn_graph_execute failed. Error %d", QNN_GET_ERROR_CODE(error));
378378
return Error::Internal;
379379
}
380-
381380
if (IsTensorDump()) {
382381
// TODO: Need to handle the graph which is partitioned.
383382
// Maybe we could use graph name.
384-
std::string dir = options_->tensor_dump_output_path()->str() + "/Result/";
385-
CreateDirectory(dir);
386-
QNN_EXECUTORCH_LOG_INFO("Dump tensor to the path: %s", dir.c_str());
387383
for (std::size_t out_idx = 0; out_idx < output_tensor_structs.size();
388384
++out_idx) {
389385
const Qnn_Tensor_t& output_tensor = output_tensor_structs[out_idx];
390-
391-
std::string output_path =
392-
dir + QNN_VER_PTR(output_tensor)->name + "_tensor.raw";
393-
394-
std::ofstream fout(output_path, std::ios::binary);
395-
if (fout.fail()) {
396-
QNN_EXECUTORCH_LOG_ERROR(
397-
"Dump tensor name: %s Failed.", QNN_VER_PTR(output_tensor)->name);
398-
return Error::Internal;
399-
}
400-
401-
fout.write(
402-
static_cast<const char*>(QNN_VER_PTR(output_tensor)->clientBuf.data),
403-
QNN_VER_PTR(output_tensor)->clientBuf.dataSize);
386+
std::vector<exec_aten::SizesType> sizes(
387+
QNN_VER_PTR(output_tensor)->dimensions,
388+
QNN_VER_PTR(output_tensor)->dimensions +
389+
QNN_VER_PTR(output_tensor)->rank);
390+
391+
auto dump_tensor = executorch::extension::from_blob(
392+
QNN_VER_PTR(output_tensor)->clientBuf.data,
393+
sizes,
394+
qnn_dtype_to_scalar_type_[QNN_VER_PTR(output_tensor)->dataType]);
395+
396+
torch::executor::event_tracer_log_output_delegate<exec_aten::Tensor>(
397+
event_tracer,
398+
QNN_VER_PTR(output_tensor)->name,
399+
/*delegate_debug_id=*/static_cast<torch::executor::DebugHandle>(-1),
400+
*dump_tensor);
404401
}
405402
}
406403

backends/qualcomm/runtime/QnnManager.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ class QnnManager {
3737

3838
Error Execute(
3939
const std::vector<Qnn_Tensor_t>& input_tensor_structs,
40-
std::vector<Qnn_Tensor_t>& output_tensor_structs);
40+
std::vector<Qnn_Tensor_t>& output_tensor_structs,
41+
EventTracer* event_tracer);
4142

4243
Error ProfileExecuteData(EventTracer* event_tracer);
4344

@@ -52,7 +53,7 @@ class QnnManager {
5253
}
5354

5455
bool IsTensorDump() {
55-
return options_->tensor_dump_output_path()->size() > 0;
56+
return options_->dump_intermediate_outputs();
5657
}
5758

5859
bool IsNodeSupportedByBackend(

backends/qualcomm/runtime/backends/QnnProfiler.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
*/
88

99
#include <executorch/backends/qualcomm/runtime/backends/QnnProfiler.h>
10-
#include <iostream>
1110

1211
namespace torch {
1312
namespace executor {

backends/qualcomm/runtime/targets.bzl

+1
Original file line numberDiff line numberDiff line change
@@ -63,5 +63,6 @@ def define_common_targets():
6363
"//executorch/backends/qualcomm/aot/wrappers:wrappers",
6464
"//executorch/runtime/backend:interface",
6565
"//executorch/runtime/core:core",
66+
"//executorch/extension/tensor:tensor",
6667
],
6768
)

backends/qualcomm/serialization/qnn_compile_spec_schema.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ class QnnExecuTorchOptions:
129129
library_path: str = ""
130130
log_level: QnnExecuTorchLogLevel = QnnExecuTorchLogLevel.kLogOff
131131
online_prepare: bool = False
132-
tensor_dump_output_path: str = ""
132+
dump_intermediate_outputs: bool = False
133133
profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff
134134
shared_buffer: bool = False
135135
is_from_context_binary: bool = False

backends/qualcomm/serialization/schema.fbs

+3-5
Original file line numberDiff line numberDiff line change
@@ -164,11 +164,9 @@ table QnnExecuTorchOptions {
164164
/// Check if on-device graph construction. Default is false.
165165
online_prepare:bool;
166166

167-
/// Tensor dump output path. If a path is given, Delegate would write
168-
/// outputs of each OP there.
169-
/// In ALL cases, we don't recommend to set this option.
170-
/// This option exist just for debugging some accuracy issues.
171-
tensor_dump_output_path:string;
167+
/// If tensor dump is enabled, all intermediate tensors output will be dumped.
168+
/// This option exists for debugging accuracy issues. Default is off.
169+
dump_intermediate_outputs:bool;
172170

173171
/// Profiling level of the delegate and the backend. Default is off.
174172
profile_level:QnnExecuTorchProfileLevel;

backends/qualcomm/tests/test_qnn_delegate.py

+38-4
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def setUp(self):
6868
debug=False,
6969
saver=False,
7070
online_prepare=TestQNN.online_prepare,
71-
tensor_dump_output_path="",
71+
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
7272
profile=TestQNN.enable_profile,
7373
shared_buffer=TestQNN.shared_buffer,
7474
)
@@ -490,7 +490,7 @@ def setUp(self):
490490
debug=False,
491491
saver=False,
492492
online_prepare=TestQNN.online_prepare,
493-
tensor_dump_output_path="",
493+
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
494494
profile=TestQNN.enable_profile,
495495
shared_buffer=TestQNN.shared_buffer,
496496
)
@@ -604,7 +604,7 @@ def setUp(self):
604604
debug=False,
605605
saver=False,
606606
online_prepare=TestQNN.online_prepare,
607-
tensor_dump_output_path="",
607+
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
608608
profile=TestQNN.enable_profile,
609609
shared_buffer=TestQNN.shared_buffer,
610610
)
@@ -1121,7 +1121,7 @@ def setUp(self):
11211121
debug=False,
11221122
saver=False,
11231123
online_prepare=TestQNN.online_prepare,
1124-
tensor_dump_output_path="",
1124+
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
11251125
profile=TestQNN.enable_profile,
11261126
shared_buffer=TestQNN.shared_buffer,
11271127
)
@@ -1287,6 +1287,22 @@ def setUp(self):
12871287
saver=False,
12881288
)
12891289

1290+
def test_qnn_backend_dump_intermediate_outputs(self):
1291+
backend_options = generate_htp_compiler_spec(use_fp16=True)
1292+
TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
1293+
soc_model=self.arch_table[TestQNN.model],
1294+
backend_options=backend_options,
1295+
dump_intermediate_outputs=True,
1296+
)
1297+
module = Relu() # noqa: F405
1298+
sample_input = (torch.randn([2, 5, 1, 3]),)
1299+
self.lower_module_and_test_output(
1300+
module,
1301+
sample_input,
1302+
expected_partitions=1,
1303+
expected_intermediate_events=3,
1304+
)
1305+
12901306
def test_qnn_backend_skip_node_id(self):
12911307
module = SimpleModel() # noqa: F405
12921308
sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
@@ -1442,6 +1458,23 @@ def setUp(self):
14421458
saver=False,
14431459
)
14441460

1461+
def test_qnn_backend_dump_intermediate_outputs(self):
1462+
backend_options = generate_htp_compiler_spec(use_fp16=False)
1463+
TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
1464+
soc_model=self.arch_table[TestQNN.model],
1465+
backend_options=backend_options,
1466+
dump_intermediate_outputs=True,
1467+
)
1468+
module = Relu() # noqa: F405
1469+
sample_input = (torch.randn([2, 5, 1, 3]),)
1470+
module = self.get_qdq_module(module, sample_input)
1471+
self.lower_module_and_test_output(
1472+
module,
1473+
sample_input,
1474+
expected_partitions=1,
1475+
expected_intermediate_events=5,
1476+
)
1477+
14451478
def test_qnn_backend_skip_node_id_partitioner(self):
14461479
module = SimpleModel() # noqa: F405
14471480
sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
@@ -2720,6 +2753,7 @@ def setup_environment():
27202753
TestQNN.oss_repo = args.oss_repo
27212754
TestQNN.shared_buffer = args.shared_buffer
27222755
TestQNN.enable_x86_64 = args.enable_x86_64
2756+
TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
27232757
return sys.argv[:1] + ns_args
27242758

27252759

backends/qualcomm/tests/utils.py

+39-6
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,7 @@
2727
QcomChipset,
2828
)
2929
from executorch.backends.qualcomm.utils.utils import capture_program
30-
from executorch.devtools import generate_etrecord
31-
from executorch.devtools.inspector import Inspector
30+
from executorch.devtools import generate_etrecord, Inspector
3231
from executorch.examples.qualcomm.utils import (
3332
generate_inputs,
3433
make_output_dir,
@@ -181,13 +180,14 @@ def _save_model_and_expected_output(
181180

182181
return input_list, ref_outputs, pte_fname
183182

184-
def verify_output(
183+
def verify_output( # noqa: C901
185184
self,
186185
module: torch.nn.Module,
187186
sample_inputs: Tuple[torch.Tensor],
188187
executorch_prog: ExecutorchProgram | LoweredBackendModule,
189188
etrecord_path: str = "etrecord.bin",
190189
expected_profile_events: int = -1,
190+
expected_intermediate_events: int = -1,
191191
):
192192
with tempfile.TemporaryDirectory() as tmp_dir:
193193
buffer = (
@@ -211,6 +211,7 @@ def verify_output(
211211
output_dir = f"{tmp_dir}/outputs"
212212
outputs = []
213213
etdump_path = f"{tmp_dir}/etdump.etdp"
214+
debug_output_path = f"{tmp_dir}/debug_output.bin"
214215

215216
def post_process():
216217
for i, f in enumerate(sorted(os.listdir(output_dir))):
@@ -225,6 +226,16 @@ def validate_profile():
225226
len(inspector.to_dataframe().index) == expected_profile_events
226227
)
227228

229+
def validate_intermediate_tensor():
230+
inspector = Inspector(
231+
etdump_path=etdump_path, debug_buffer_path=debug_output_path
232+
)
233+
for event_block in inspector.event_blocks:
234+
if event_block.name == "Execute":
235+
self.assertTrue(
236+
len(event_block.events) == expected_intermediate_events
237+
)
238+
228239
if self.enable_x86_64:
229240
generate_inputs(tmp_dir, "input_list.txt", [sample_inputs], input_list)
230241
make_output_dir(output_dir)
@@ -277,6 +288,9 @@ def validate_profile():
277288
# Verify the etdump
278289
if expected_profile_events != -1:
279290
validate_profile()
291+
292+
if expected_intermediate_events != -1:
293+
validate_intermediate_tensor()
280294
else:
281295
adb = SimpleADB(
282296
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
@@ -287,6 +301,9 @@ def validate_profile():
287301
host_id=self.host,
288302
soc_model=self.model,
289303
error_only=self.error_only,
304+
dump_intermediate_outputs=(
305+
True if expected_intermediate_events != -1 else False
306+
),
290307
)
291308
adb.push(inputs=[sample_inputs], input_list=input_list)
292309
adb.execute()
@@ -296,12 +313,20 @@ def validate_profile():
296313
if expected_profile_events != -1:
297314
adb.pull_etdump(etdump_path, callback=validate_profile)
298315

316+
if expected_intermediate_events != -1:
317+
adb.pull_debug_output(
318+
etdump_path,
319+
debug_output_path,
320+
callback=validate_intermediate_tensor,
321+
)
322+
299323
def lower_module_and_test_output(
300324
self,
301325
module: torch.nn.Module,
302326
sample_inputs: Tuple[torch.Tensor],
303327
expected_partitions: int = 1,
304328
expected_profile_events: int = -1,
329+
expected_intermediate_events: int = -1,
305330
assert_output_equal: bool = True,
306331
skip_node_id_set: set = None,
307332
skip_node_op_set: set = None,
@@ -346,11 +371,19 @@ def lower_module_and_test_output(
346371
etrecord_path = "etrecord.bin"
347372
if self.enable_profile:
348373
generate_etrecord(etrecord_path, edge_copy, exec_prog)
349-
350374
# Check numerics
351-
if assert_output_equal or expected_profile_events != -1:
375+
if (
376+
assert_output_equal
377+
or expected_profile_events != -1
378+
or expected_intermediate_events != -1
379+
):
352380
self.verify_output(
353-
module, sample_inputs, exec_prog, etrecord_path, expected_profile_events
381+
module,
382+
sample_inputs,
383+
exec_prog,
384+
etrecord_path,
385+
expected_profile_events,
386+
expected_intermediate_events,
354387
)
355388

356389
def get_qdq_module(

0 commit comments

Comments (0)