Skip to content

Commit eecf74f

Browse files
winskuo-quic authored
and facebook-github-bot committed
Qualcomm AI Engine Direct - Intermediate Tensor Dump (#5310)
Summary: - Intermediate Tensor Dump enablement - Added UT for intermediate tensor dump Pull Request resolved: #5310 Reviewed By: kirklandsign Differential Revision: D62616471 Pulled By: cccclai fbshipit-source-id: b571c6a0a9537e3a93c3ff752b07e5f05bd1d580
1 parent eb0cdf7 commit eecf74f

14 files changed

+236
-66
lines changed

backends/qualcomm/CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ target_link_libraries(
181181
)
182182
target_link_libraries(
183183
qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager
184-
executorch_no_prim_ops qcir_utils
184+
executorch_no_prim_ops qcir_utils extension_tensor
185185
)
186186
set_target_properties(
187187
qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
@@ -246,6 +246,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
246246
qnn_executorch_header
247247
executorch
248248
qcir_utils
249+
extension_tensor
249250
)
250251
target_link_libraries(
251252
PyQnnWrapperAdaptor PRIVATE pybind11::module pybind11::lto wrappers

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -213,8 +213,10 @@ Error QnnExecuTorchBackend::execute(
213213
}
214214

215215
ET_CHECK_OR_RETURN_ERROR(
216-
qnn_manager->Execute(input_tensor_structs, output_tensor_structs) ==
217-
Error::Ok,
216+
qnn_manager->Execute(
217+
input_tensor_structs,
218+
output_tensor_structs,
219+
context.event_tracer()) == Error::Ok,
218220
Internal,
219221
"Fail to execute graph");
220222
ET_CHECK_OR_RETURN_ERROR(

backends/qualcomm/runtime/QnnManager.cpp

+19-22
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <executorch/backends/qualcomm/runtime/Utils.h>
1111
#include <executorch/backends/qualcomm/runtime/backends/QnnBackendCommon.h>
1212
#include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>
13+
#include <executorch/extension/tensor/tensor.h>
1314
#include <algorithm>
1415
#include <cstdlib>
1516
#include <cstring>
@@ -57,9 +58,7 @@ QnnManager::QnnManager(
5758
"backend_type: %s", EnumNameQnnExecuTorchBackendType(backend_type));
5859
QNN_EXECUTORCH_LOG_INFO("graph_name: %s", options_->graph_name()->c_str());
5960
QNN_EXECUTORCH_LOG_INFO("library_path: %s", library_path.c_str());
60-
QNN_EXECUTORCH_LOG_INFO(
61-
"tensor_dump_output_path: %s",
62-
options_->tensor_dump_output_path()->c_str());
61+
QNN_EXECUTORCH_LOG_INFO("dump intermediate outputs: %s", IsTensorDump());
6362
QNN_EXECUTORCH_LOG_INFO(
6463
"log_level: %s", EnumNameQnnExecuTorchLogLevel(options_->log_level()));
6564
QNN_EXECUTORCH_LOG_INFO(
@@ -366,7 +365,8 @@ Error QnnManager::AllocateTensor(
366365

367366
Error QnnManager::Execute(
368367
const std::vector<Qnn_Tensor_t>& input_tensor_structs,
369-
std::vector<Qnn_Tensor_t>& output_tensor_structs) {
368+
std::vector<Qnn_Tensor_t>& output_tensor_structs,
369+
EventTracer* event_tracer) {
370370
Qnn_ErrorHandle_t error = QNN_SUCCESS;
371371

372372
error = backend_params_ptr_->qnn_graph_ptr_->GraphExecute(
@@ -377,30 +377,27 @@ Error QnnManager::Execute(
377377
"qnn_graph_execute failed. Error %d", QNN_GET_ERROR_CODE(error));
378378
return Error::Internal;
379379
}
380-
381380
if (IsTensorDump()) {
382381
// TODO: Need to handle the graph which is partitioned.
383382
// Maybe we could use graph name.
384-
std::string dir = options_->tensor_dump_output_path()->str() + "/Result/";
385-
CreateDirectory(dir);
386-
QNN_EXECUTORCH_LOG_INFO("Dump tensor to the path: %s", dir.c_str());
387383
for (std::size_t out_idx = 0; out_idx < output_tensor_structs.size();
388384
++out_idx) {
389385
const Qnn_Tensor_t& output_tensor = output_tensor_structs[out_idx];
390-
391-
std::string output_path =
392-
dir + QNN_VER_PTR(output_tensor)->name + "_tensor.raw";
393-
394-
std::ofstream fout(output_path, std::ios::binary);
395-
if (fout.fail()) {
396-
QNN_EXECUTORCH_LOG_ERROR(
397-
"Dump tensor name: %s Failed.", QNN_VER_PTR(output_tensor)->name);
398-
return Error::Internal;
399-
}
400-
401-
fout.write(
402-
static_cast<const char*>(QNN_VER_PTR(output_tensor)->clientBuf.data),
403-
QNN_VER_PTR(output_tensor)->clientBuf.dataSize);
386+
std::vector<exec_aten::SizesType> sizes(
387+
QNN_VER_PTR(output_tensor)->dimensions,
388+
QNN_VER_PTR(output_tensor)->dimensions +
389+
QNN_VER_PTR(output_tensor)->rank);
390+
391+
auto dump_tensor = executorch::extension::from_blob(
392+
QNN_VER_PTR(output_tensor)->clientBuf.data,
393+
sizes,
394+
qnn_dtype_to_scalar_type_[QNN_VER_PTR(output_tensor)->dataType]);
395+
396+
torch::executor::event_tracer_log_output_delegate<exec_aten::Tensor>(
397+
event_tracer,
398+
QNN_VER_PTR(output_tensor)->name,
399+
/*delegate_debug_id=*/static_cast<torch::executor::DebugHandle>(-1),
400+
*dump_tensor);
404401
}
405402
}
406403

backends/qualcomm/runtime/QnnManager.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ class QnnManager {
3737

3838
Error Execute(
3939
const std::vector<Qnn_Tensor_t>& input_tensor_structs,
40-
std::vector<Qnn_Tensor_t>& output_tensor_structs);
40+
std::vector<Qnn_Tensor_t>& output_tensor_structs,
41+
EventTracer* event_tracer);
4142

4243
Error ProfileExecuteData(EventTracer* event_tracer);
4344

@@ -52,7 +53,7 @@ class QnnManager {
5253
}
5354

5455
bool IsTensorDump() {
55-
return options_->tensor_dump_output_path()->size() > 0;
56+
return options_->dump_intermediate_outputs();
5657
}
5758

5859
bool IsNodeSupportedByBackend(

backends/qualcomm/runtime/backends/QnnProfiler.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
*/
88

99
#include <executorch/backends/qualcomm/runtime/backends/QnnProfiler.h>
10-
#include <iostream>
1110

1211
namespace torch {
1312
namespace executor {

backends/qualcomm/runtime/targets.bzl

+1
Original file line numberDiff line numberDiff line change
@@ -63,5 +63,6 @@ def define_common_targets():
6363
"//executorch/backends/qualcomm/aot/wrappers:wrappers",
6464
"//executorch/runtime/backend:interface",
6565
"//executorch/runtime/core:core",
66+
"//executorch/extension/tensor:tensor",
6667
],
6768
)

backends/qualcomm/serialization/qnn_compile_spec_schema.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ class QnnExecuTorchOptions:
129129
library_path: str = ""
130130
log_level: QnnExecuTorchLogLevel = QnnExecuTorchLogLevel.kLogOff
131131
online_prepare: bool = False
132-
tensor_dump_output_path: str = ""
132+
dump_intermediate_outputs: bool = False
133133
profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff
134134
shared_buffer: bool = False
135135
is_from_context_binary: bool = False

backends/qualcomm/serialization/schema.fbs

+3-5
Original file line numberDiff line numberDiff line change
@@ -164,11 +164,9 @@ table QnnExecuTorchOptions {
164164
/// Check if on-device graph construction. Default is false.
165165
online_prepare:bool;
166166

167-
/// Tensor dump output path. If a path is given, Delegate would write
168-
/// outputs of each OP there.
169-
/// In ALL cases, we don't recommend to set this option.
170-
/// This option exist just for debugging some accuracy issues.
171-
tensor_dump_output_path:string;
167+
/// If tensor dump is enabled, all intermediate tensors output will be dumped.
168+
/// This option exists for debugging accuracy issues. Default is off.
169+
dump_intermediate_outputs:bool;
172170

173171
/// Profiling level of the delegate and the backend. Default is off.
174172
profile_level:QnnExecuTorchProfileLevel;

backends/qualcomm/tests/test_qnn_delegate.py

+38-4
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def setUp(self):
6868
debug=False,
6969
saver=False,
7070
online_prepare=TestQNN.online_prepare,
71-
tensor_dump_output_path="",
71+
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
7272
profile=TestQNN.enable_profile,
7373
shared_buffer=TestQNN.shared_buffer,
7474
)
@@ -490,7 +490,7 @@ def setUp(self):
490490
debug=False,
491491
saver=False,
492492
online_prepare=TestQNN.online_prepare,
493-
tensor_dump_output_path="",
493+
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
494494
profile=TestQNN.enable_profile,
495495
shared_buffer=TestQNN.shared_buffer,
496496
)
@@ -604,7 +604,7 @@ def setUp(self):
604604
debug=False,
605605
saver=False,
606606
online_prepare=TestQNN.online_prepare,
607-
tensor_dump_output_path="",
607+
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
608608
profile=TestQNN.enable_profile,
609609
shared_buffer=TestQNN.shared_buffer,
610610
)
@@ -1121,7 +1121,7 @@ def setUp(self):
11211121
debug=False,
11221122
saver=False,
11231123
online_prepare=TestQNN.online_prepare,
1124-
tensor_dump_output_path="",
1124+
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
11251125
profile=TestQNN.enable_profile,
11261126
shared_buffer=TestQNN.shared_buffer,
11271127
)
@@ -1287,6 +1287,22 @@ def setUp(self):
12871287
saver=False,
12881288
)
12891289

1290+
def test_qnn_backend_dump_intermediate_outputs(self):
1291+
backend_options = generate_htp_compiler_spec(use_fp16=True)
1292+
TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
1293+
soc_model=self.arch_table[TestQNN.model],
1294+
backend_options=backend_options,
1295+
dump_intermediate_outputs=True,
1296+
)
1297+
module = Relu() # noqa: F405
1298+
sample_input = (torch.randn([2, 5, 1, 3]),)
1299+
self.lower_module_and_test_output(
1300+
module,
1301+
sample_input,
1302+
expected_partitions=1,
1303+
expected_intermediate_events=3,
1304+
)
1305+
12901306
def test_qnn_backend_skip_node_id(self):
12911307
module = SimpleModel() # noqa: F405
12921308
sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
@@ -1442,6 +1458,23 @@ def setUp(self):
14421458
saver=False,
14431459
)
14441460

1461+
def test_qnn_backend_dump_intermediate_outputs(self):
1462+
backend_options = generate_htp_compiler_spec(use_fp16=False)
1463+
TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
1464+
soc_model=self.arch_table[TestQNN.model],
1465+
backend_options=backend_options,
1466+
dump_intermediate_outputs=True,
1467+
)
1468+
module = Relu() # noqa: F405
1469+
sample_input = (torch.randn([2, 5, 1, 3]),)
1470+
module = self.get_qdq_module(module, sample_input)
1471+
self.lower_module_and_test_output(
1472+
module,
1473+
sample_input,
1474+
expected_partitions=1,
1475+
expected_intermediate_events=5,
1476+
)
1477+
14451478
def test_qnn_backend_skip_node_id_partitioner(self):
14461479
module = SimpleModel() # noqa: F405
14471480
sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
@@ -2720,6 +2753,7 @@ def setup_environment():
27202753
TestQNN.oss_repo = args.oss_repo
27212754
TestQNN.shared_buffer = args.shared_buffer
27222755
TestQNN.enable_x86_64 = args.enable_x86_64
2756+
TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
27232757
return sys.argv[:1] + ns_args
27242758

27252759

backends/qualcomm/tests/utils.py

+39-6
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,7 @@
2727
QcomChipset,
2828
)
2929
from executorch.backends.qualcomm.utils.utils import capture_program
30-
from executorch.devtools import generate_etrecord
31-
from executorch.devtools.inspector import Inspector
30+
from executorch.devtools import generate_etrecord, Inspector
3231
from executorch.examples.qualcomm.utils import (
3332
generate_inputs,
3433
make_output_dir,
@@ -181,13 +180,14 @@ def _save_model_and_expected_output(
181180

182181
return input_list, ref_outputs, pte_fname
183182

184-
def verify_output(
183+
def verify_output( # noqa: C901
185184
self,
186185
module: torch.nn.Module,
187186
sample_inputs: Tuple[torch.Tensor],
188187
executorch_prog: ExecutorchProgram | LoweredBackendModule,
189188
etrecord_path: str = "etrecord.bin",
190189
expected_profile_events: int = -1,
190+
expected_intermediate_events: int = -1,
191191
):
192192
with tempfile.TemporaryDirectory() as tmp_dir:
193193
buffer = (
@@ -211,6 +211,7 @@ def verify_output(
211211
output_dir = f"{tmp_dir}/outputs"
212212
outputs = []
213213
etdump_path = f"{tmp_dir}/etdump.etdp"
214+
debug_output_path = f"{tmp_dir}/debug_output.bin"
214215

215216
def post_process():
216217
for i, f in enumerate(sorted(os.listdir(output_dir))):
@@ -225,6 +226,16 @@ def validate_profile():
225226
len(inspector.to_dataframe().index) == expected_profile_events
226227
)
227228

229+
def validate_intermediate_tensor():
230+
inspector = Inspector(
231+
etdump_path=etdump_path, debug_buffer_path=debug_output_path
232+
)
233+
for event_block in inspector.event_blocks:
234+
if event_block.name == "Execute":
235+
self.assertTrue(
236+
len(event_block.events) == expected_intermediate_events
237+
)
238+
228239
if self.enable_x86_64:
229240
generate_inputs(tmp_dir, "input_list.txt", [sample_inputs], input_list)
230241
make_output_dir(output_dir)
@@ -277,6 +288,9 @@ def validate_profile():
277288
# Verify the etdump
278289
if expected_profile_events != -1:
279290
validate_profile()
291+
292+
if expected_intermediate_events != -1:
293+
validate_intermediate_tensor()
280294
else:
281295
adb = SimpleADB(
282296
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
@@ -287,6 +301,9 @@ def validate_profile():
287301
host_id=self.host,
288302
soc_model=self.model,
289303
error_only=self.error_only,
304+
dump_intermediate_outputs=(
305+
True if expected_intermediate_events != -1 else False
306+
),
290307
)
291308
adb.push(inputs=[sample_inputs], input_list=input_list)
292309
adb.execute()
@@ -296,12 +313,20 @@ def validate_profile():
296313
if expected_profile_events != -1:
297314
adb.pull_etdump(etdump_path, callback=validate_profile)
298315

316+
if expected_intermediate_events != -1:
317+
adb.pull_debug_output(
318+
etdump_path,
319+
debug_output_path,
320+
callback=validate_intermediate_tensor,
321+
)
322+
299323
def lower_module_and_test_output(
300324
self,
301325
module: torch.nn.Module,
302326
sample_inputs: Tuple[torch.Tensor],
303327
expected_partitions: int = 1,
304328
expected_profile_events: int = -1,
329+
expected_intermediate_events: int = -1,
305330
assert_output_equal: bool = True,
306331
skip_node_id_set: set = None,
307332
skip_node_op_set: set = None,
@@ -346,11 +371,19 @@ def lower_module_and_test_output(
346371
etrecord_path = "etrecord.bin"
347372
if self.enable_profile:
348373
generate_etrecord(etrecord_path, edge_copy, exec_prog)
349-
350374
# Check numerics
351-
if assert_output_equal or expected_profile_events != -1:
375+
if (
376+
assert_output_equal
377+
or expected_profile_events != -1
378+
or expected_intermediate_events != -1
379+
):
352380
self.verify_output(
353-
module, sample_inputs, exec_prog, etrecord_path, expected_profile_events
381+
module,
382+
sample_inputs,
383+
exec_prog,
384+
etrecord_path,
385+
expected_profile_events,
386+
expected_intermediate_events,
354387
)
355388

356389
def get_qdq_module(

0 commit comments

Comments (0)