From e80b2a24bed4030fbda54de8995131d3fc317946 Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Tue, 5 Nov 2024 16:33:09 +0800 Subject: [PATCH 1/3] Qualcomm AI Engine Direct - enable multiple graphs in single pte Summary: - support multiple graphs in single qnn context in runtime - helper function in aot for generating multi-method pte - enable weight sharing mechanism on HTP - support file signature for cache reuse - changes that making sure everything works as usual - test cases --- backends/qualcomm/CMakeLists.txt | 5 +- .../qualcomm/_passes/remove_redundancy.py | 1 + backends/qualcomm/aot/ir/qcir.fbs | 7 +- backends/qualcomm/aot/ir/qcir_utils.cpp | 8 +- .../aot/python/PyQnnManagerAdaptor.cpp | 14 +- .../qualcomm/aot/python/PyQnnManagerAdaptor.h | 196 +++++++++++++-- .../qualcomm/partition/qnn_partitioner.py | 9 +- backends/qualcomm/partition/utils.py | 22 ++ backends/qualcomm/qnn_preprocess.py | 10 +- backends/qualcomm/runtime/Logging.h | 2 +- .../qualcomm/runtime/QnnExecuTorchBackend.cpp | 179 +++++-------- .../qualcomm/runtime/QnnExecuTorchBackend.h | 15 ++ backends/qualcomm/runtime/QnnManager.cpp | 235 +++++++++++++++--- backends/qualcomm/runtime/QnnManager.h | 47 +++- .../qualcomm/runtime/backends/CMakeLists.txt | 1 + .../runtime/backends/QnnBackendCache.cpp | 115 +++++---- .../runtime/backends/QnnBackendCache.h | 32 ++- .../runtime/backends/QnnBackendFactory.cpp | 4 +- .../runtime/backends/QnnBackendFactory.h | 2 +- .../runtime/backends/QnnContextCommon.cpp | 22 +- .../runtime/backends/QnnContextCommon.h | 18 +- .../runtime/backends/QnnDeviceCommon.h | 2 +- .../runtime/backends/QnnGraphCommon.cpp | 39 ++- .../runtime/backends/QnnGraphCommon.h | 41 +-- .../qualcomm/runtime/backends/QnnLogger.h | 2 +- .../backends/htpbackend/HtpBackendCache.h | 6 +- .../htpbackend/HtpContextCustomConfig.h | 2 +- .../htpbackend/HtpDeviceCustomConfig.h | 2 +- .../htpbackend/HtpDevicePlatformInfoConfig.h | 2 +- .../runtime/backends/htpbackend/HtpGraph.h | 3 +- .../htpbackend/HtpGraphCustomConfig.cpp | 7 +- .../htpbackend/HtpGraphCustomConfig.h | 5 +- .../aarch64/HtpGraphCustomConfig.cpp | 21 ++ .../x86_64/HtpContextCustomConfig.cpp | 13 +- .../x86_64/HtpGraphCustomConfig.cpp | 21 ++ .../qualcomm/serialization/qc_binary_info.fbs | 20 ++ .../{schema.fbs => qc_compiler_spec.fbs} | 17 +- ...nn_compile_spec_schema.py => qc_schema.py} | 8 + ...ec_serialize.py => qc_schema_serialize.py} | 42 ++-- backends/qualcomm/serialization/targets.bzl | 8 +- backends/qualcomm/targets.bzl | 4 +- backends/qualcomm/tests/models.py | 4 +- backends/qualcomm/tests/test_qnn_delegate.py | 121 +++++++-- backends/qualcomm/tests/utils.py | 44 ++-- backends/qualcomm/utils/utils.py | 198 +++++++++++---- .../executor_runner/qnn_executor_runner.cpp | 25 +- examples/qualcomm/oss_scripts/llama2/llama.py | 4 +- .../qualcomm/oss_scripts/llama3_2/llama.py | 4 +- .../llama/llama2/qaihub_llama2_7b.py | 40 +-- .../llama/llama3/qaihub_llama3_8b.py | 27 +- .../qaihub_scripts/llama/runner/io_memory.cpp | 13 +- .../qaihub_scripts/llama/runner/io_memory.h | 1 + .../qaihub_scripts/llama/runner/runner.cpp | 13 +- .../qaihub_scripts/llama/runner/runner.h | 1 + .../qaihub_stable_diffusion.py | 26 +- .../stable_diffusion/runner/runner.cpp | 30 ++- .../stable_diffusion/runner/runner.h | 1 + .../qualcomm/qaihub_scripts/utils/export.py | 40 ++- .../qualcomm/qaihub_scripts/utils/utils.py | 54 ++-- examples/qualcomm/scripts/export_example.py | 4 +- .../qualcomm/scripts/mobilebert_fine_tune.py | 4 +- examples/qualcomm/utils.py | 7 +- 62 files changed, 1279 
insertions(+), 591 deletions(-) create mode 100644 backends/qualcomm/partition/utils.py create mode 100644 backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpGraphCustomConfig.cpp create mode 100644 backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpGraphCustomConfig.cpp create mode 100644 backends/qualcomm/serialization/qc_binary_info.fbs rename backends/qualcomm/serialization/{schema.fbs => qc_compiler_spec.fbs} (91%) rename backends/qualcomm/serialization/{qnn_compile_spec_schema.py => qc_schema.py} (95%) rename backends/qualcomm/serialization/{qnn_compile_spec_serialize.py => qc_schema_serialize.py} (50%) diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index a73b4ba85da..1f92b2d8cfd 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -74,7 +74,10 @@ include_directories( ${EXECUTORCH_SOURCE_DIR}/third-party/flatbuffers/include ) -set(_qnn_schema__srcs backends/qualcomm/serialization/schema.fbs) +set(_qnn_schema__srcs + backends/qualcomm/serialization/qc_compiler_spec.fbs + backends/qualcomm/serialization/qc_binary_info.fbs +) set(_qnn_schema__include_dir "${CMAKE_BINARY_DIR}/schema/include") # Paths to headers generated from the .fbs files. set(_qnn_schema__outputs) diff --git a/backends/qualcomm/_passes/remove_redundancy.py b/backends/qualcomm/_passes/remove_redundancy.py index c54596f6583..825b2584ca7 100644 --- a/backends/qualcomm/_passes/remove_redundancy.py +++ b/backends/qualcomm/_passes/remove_redundancy.py @@ -20,6 +20,7 @@ class RemoveRedundancy(ExportPass): exir_ops.edge.aten.clone.default, torch.ops.aten.alias.default, exir_ops.edge.aten.alias.default, + exir_ops.edge.aten.lift_fresh_copy.default, } def __init__(self): diff --git a/backends/qualcomm/aot/ir/qcir.fbs b/backends/qualcomm/aot/ir/qcir.fbs index 2d8b1f78fec..6c16a54e0db 100755 --- a/backends/qualcomm/aot/ir/qcir.fbs +++ b/backends/qualcomm/aot/ir/qcir.fbs @@ -94,8 +94,13 @@ table Operator { } table Graph { + name: string; nodes: [Operator]; tensors: [Tensor]; } -root_type Graph; +table Context { + graphs: [Graph]; +} + +root_type Context; diff --git a/backends/qualcomm/aot/ir/qcir_utils.cpp b/backends/qualcomm/aot/ir/qcir_utils.cpp index 153604f8d9d..8cf024ba006 100755 --- a/backends/qualcomm/aot/ir/qcir_utils.cpp +++ b/backends/qualcomm/aot/ir/qcir_utils.cpp @@ -161,9 +161,7 @@ flatbuffers::Offset ToQuantizeParam( } } break; default: - QNN_EXECUTORCH_LOG_WARN( - "QNN_QUANTIZATION_ENCODING_UNDEFINED detected: %s", - QNN_VER_PTR(tensor)->name); + // encodings are not required if lowering with floating point precision break; } return CreateQuantizeParamDirect( @@ -229,9 +227,7 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) { const_cast(param->offsets()->data()); } break; default: - QNN_EXECUTORCH_LOG_WARN( - "qcir::QuantizeType::UNDEFINED detected: %s", - tensor->name()->c_str()); + // encodings are not required if lowering with floating point precision break; } return p; diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp index f3f7b618c9d..9dc7f7159cb 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp @@ -30,16 +30,26 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) { py::class_>(m, "QnnManager") .def(py::init()) .def(py::init()) + .def(py::init()) .def("Init", &PyQnnManager::Init) .def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend) - .def("Compile", 
&PyQnnManager::Compile) + .def("Compile", py::overload_cast<>(&PyQnnManager::Compile)) + .def( + "Compile", + py::overload_cast< + const std::string&, + std::vector>&>(&PyQnnManager::Compile)) .def("Destroy", &PyQnnManager::Destroy) .def("IsAvailable", &PyQnnManager::IsAvailable) .def("IsTensorDump", &PyQnnManager::IsTensorDump) .def("AllocateTensor", &PyQnnManager::AllocateTensor) .def("GetGraphInputs", &PyQnnManager::GetGraphInputs) .def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs) - .def("GetSpillFillBufferSize", &PyQnnManager::GetSpillFillBufferSize); + .def("GetGraphNames", &PyQnnManager::GetGraphNames) + .def("GetSpillFillBufferSize", &PyQnnManager::GetSpillFillBufferSize) + .def( + "MakeBinaryInfo", + py::overload_cast(&PyQnnManager::MakeBinaryInfo)); } } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h index 5cfae78c353..55429f2b430 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h @@ -8,10 +8,11 @@ #pragma once #include #include +#include +#include #include #include #include -#include #include #include #include @@ -35,6 +36,7 @@ class PyQnnManager { qnn_manager_ = std::make_shared( qnn_executorch_options, qnn_executorch_context_binary_); } + // used for loading context binary directly explicit PyQnnManager(const py::bytes& buffer, const py::bytes& ctx_bin) : qnn_executorch_option_ptr_(buffer) { @@ -42,25 +44,119 @@ class PyQnnManager { qnn_executorch_option_ptr_.cast().data()); py::buffer_info info(py::buffer(ctx_bin).request()); - qnn_executorch_context_binary_.buffer = static_cast(info.ptr); + qnn_executorch_context_binary_.buffer = info.ptr; qnn_executorch_context_binary_.nbytes = info.size * info.itemsize; qnn_manager_ = std::make_shared( qnn_executorch_options, qnn_executorch_context_binary_); } + // used for loading multiple graphs in qcir + explicit PyQnnManager(const py::bytes& buffer, const py::list& qcirs) + : qnn_executorch_option_ptr_(buffer) { + auto qnn_executorch_options = GetQnnExecuTorchOptions( + qnn_executorch_option_ptr_.cast().data()); + + // merge multiple qcirs into one context with multiple graphs + std::vector> graphs; + for (size_t i = 0; i < qcirs.size(); ++i) { + py::buffer_info info(py::buffer(qcirs[i].cast()).request()); + flatbuffers::Verifier verifier_binary_info( + static_cast(info.ptr), + info.size * info.itemsize); + if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { + QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); + return; + } + auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr); + + flatbuffers::Verifier verifier_qcir( + binary_info->data()->data(), binary_info->data()->size()); + if (!qcir::VerifyContextBuffer(verifier_qcir)) { + QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format"); + return; + } + auto context = qcir::GetContext(binary_info->data()->data()); + for (const auto& graph : *context->graphs()) { + std::vector> tensors; + for (const auto tensor : *graph->tensors()) { + // here we need to take a detour to merge multiple qcir flatbuffers + // outer ToTensor + // return: flatbuffers::Offset + // consume: QnnTensor, flatbuffers::FlatBufferBuilder* + // inner ToTensor + // return: QnnTensor + // consume: flatbuffers::Vector<::flatbuffers::Offset> + tensors.emplace_back(ToTensor(ToTensor(tensor), &builder_)); + } + std::vector> nodes; + for (const auto& node : *graph->nodes()) { + int32_t* inputs_ptr = 
const_cast(node->inputs()->data()); + int32_t* outputs_ptr = const_cast(node->outputs()->data()); + int32_t* params_ptr = const_cast(node->params()->data()); + std::vector inputs( + inputs_ptr, inputs_ptr + node->inputs()->size()); + std::vector outputs( + outputs_ptr, outputs_ptr + node->outputs()->size()); + std::vector params( + params_ptr, params_ptr + node->params()->size()); + nodes.emplace_back(qcir::CreateOperatorDirect( + builder_, + node->name()->str().c_str(), + node->package_name()->str().c_str(), + node->type_name()->str().c_str(), + &inputs, + &outputs, + ¶ms)); + } + graphs.emplace_back(qcir::CreateGraphDirect( + builder_, graph->name()->str().c_str(), &nodes, &tensors)); + } + } + + auto context = qcir::CreateContextDirect(builder_, &graphs); + builder_.Finish(context); + QnnExecuTorchContextBinary qcir_bin( + {builder_.GetBufferPointer(), builder_.GetSize()}); + + qnn_executorch_context_binary_ = MakeBinaryInfo(qcir_bin); + qnn_manager_ = std::make_shared( + qnn_executorch_options, qnn_executorch_context_binary_); + } + executorch::runtime::Error Init() { return qnn_manager_->Init(); } + bool IsNodeSupportedByBackend( std::vector>& op_wrappers) { return qnn_manager_->IsNodeSupportedByBackend(op_wrappers); } + + // this method is specific for compiling multi-graphs + py::array_t Compile() { + if (qnn_manager_->CompileQcir() != Error::Ok) { + QNN_EXECUTORCH_LOG_ERROR("Fail to compile qcir"); + return py::array_t(0); + } + + // generate context binary if compilation succeded + QnnExecuTorchContextBinary binary_info; + qnn_manager_->GetContextBinary(binary_info); + // allocate py::array (to pass the result of the C++ function to Python) + auto result = py::array_t(binary_info.nbytes); + auto result_buffer = result.request(); + char* result_ptr = (char*)result_buffer.ptr; + std::memcpy(result_ptr, binary_info.buffer, binary_info.nbytes); + return result; + } + py::array_t Compile( + const std::string& graph_name, std::vector>& op_wrappers) { - QnnExecuTorchContextBinary context_binary; - flatbuffers::FlatBufferBuilder builder; + QnnExecuTorchContextBinary binary_info; - if (qnn_manager_->IsOnlinePrepare()) { + if (qnn_manager_->IsOnlinePrepare() || qnn_manager_->IsMultipleGraphs()) { + builder_.Reset(); std::vector> tensors; std::unordered_map tensor_map; @@ -74,7 +170,7 @@ class PyQnnManager { tensor_map[wrapper.get()] = i; index.push_back(i); tensors.emplace_back( - ToTensor(wrapper->CloneTensorStruct(), &builder)); + ToTensor(wrapper->CloneTensorStruct(), &builder_)); } }; @@ -112,13 +208,13 @@ class PyQnnManager { QNN_VER_PTR(t)->clientBuf.dataSize = GetDataTypeSize(QNN_VER_PTR(t)->dataType); params.push_back(tensors.size()); - tensors.emplace_back(ToTensor(t, &builder)); + tensors.emplace_back(ToTensor(t, &builder_)); } } Qnn_OpConfig_t op_config = op_wrapper->GetOpConfig(); operators.emplace_back(qcir::CreateOperatorDirect( - builder, + builder_, QNN_VER_PTR(op_config)->name, QNN_VER_PTR(op_config)->packageName, QNN_VER_PTR(op_config)->typeName, @@ -126,24 +222,34 @@ class PyQnnManager { &outputs, ¶ms)); } - auto graph = qcir::CreateGraphDirect(builder, &operators, &tensors); - builder.Finish(graph); - context_binary.buffer = builder.GetBufferPointer(); - context_binary.nbytes = builder.GetSize(); - } else if ( - qnn_manager_->Compile(op_wrappers, context_binary) != - executorch::runtime::Error::Ok) { - return py::array_t(0); + auto graph = qcir::CreateGraphDirect( + builder_, graph_name.c_str(), &operators, &tensors); + std::vector> graphs({graph}); + auto context = 
qcir::CreateContextDirect(builder_, &graphs); + builder_.Finish(context); + QnnExecuTorchContextBinary qcir_binary( + {builder_.GetBufferPointer(), builder_.GetSize()}); + binary_info = MakeBinaryInfo(qcir_binary); + } else { + if (qnn_manager_->Compile(graph_name, op_wrappers) != + executorch::runtime::Error::Ok) { + QNN_EXECUTORCH_LOG_ERROR("Fail to compile QNN graph"); + return py::array_t(0); + } + if (qnn_manager_->GetContextBinary(binary_info) != + executorch::runtime::Error::Ok) { + return py::array_t(0); + } } - // allocate py::array (to pass the result of the C++ function to - // Python) - auto result = py::array_t(context_binary.nbytes); + // allocate py::array (to pass the result of the C++ function to Python) + auto result = py::array_t(binary_info.nbytes); auto result_buffer = result.request(); char* result_ptr = (char*)result_buffer.ptr; - std::memcpy(result_ptr, context_binary.buffer, context_binary.nbytes); + std::memcpy(result_ptr, binary_info.buffer, binary_info.nbytes); return result; } + void Destroy() { return qnn_manager_->Destroy(); } @@ -156,38 +262,76 @@ class PyQnnManager { return qnn_manager_->IsTensorDump(); } - executorch::runtime::Error AllocateTensor() { - return qnn_manager_->AllocateTensor(); + executorch::runtime::Error AllocateTensor(const std::string& graph_name) { + return qnn_manager_->AllocateTensor(graph_name); } - py::list GetGraphInputs() { + py::list GetGraphInputs(const std::string& graph_name) { py::list ret; for (const std::shared_ptr& input : - qnn_manager_->GetGraphInputs()) { + qnn_manager_->GetGraphInputs(graph_name)) { ret.append(PyQnnTensorWrapper(input)); } return ret; } - py::list GetGraphOutputs() { + py::list GetGraphOutputs(const std::string& graph_name) { py::list ret; for (const std::shared_ptr& output : - qnn_manager_->GetGraphOutputs()) { + qnn_manager_->GetGraphOutputs(graph_name)) { ret.append(PyQnnTensorWrapper(output)); } return ret; } + py::list GetGraphNames() { + py::list ret; + for (const std::string& graph_name : qnn_manager_->GetGraphNames()) { + ret.append(graph_name); + } + return ret; + } + uint64_t GetSpillFillBufferSize() { return qnn_manager_->GetSpillFillBufferSize(); } + py::array_t MakeBinaryInfo(const py::bytes& ctx_bin) { + py::buffer_info info(py::buffer(ctx_bin).request()); + QnnExecuTorchContextBinary binary( + {info.ptr, static_cast(info.size * info.itemsize)}); + auto binary_info = MakeBinaryInfo(binary); + auto result = py::array_t(binary_info.nbytes); + auto result_buffer = result.request(); + std::memcpy(result_buffer.ptr, binary_info.buffer, binary_info.nbytes); + return result; + } + private: + QnnExecuTorchContextBinary MakeBinaryInfo( + const QnnExecuTorchContextBinary& ctx_bin) { + auto signature = []() { + return std::to_string( + std::chrono::high_resolution_clock::now().time_since_epoch().count()); + }; + const uint8_t* base = static_cast(ctx_bin.buffer); + std::vector data(base, base + ctx_bin.nbytes); + // add signature to binary for cache reuse in runtime + builder_.Reset(); + auto binary_info = qnn_delegate::CreateBinaryInfoDirect( + builder_, signature().c_str(), &data); + builder_.Finish(binary_info); + + return QnnExecuTorchContextBinary( + {builder_.GetBufferPointer(), builder_.GetSize()}); + } + // Store the bytes object instead of a raw pointer so that this module will // keep the bytes alive. 
const py::bytes qnn_executorch_option_ptr_; QnnExecuTorchContextBinary qnn_executorch_context_binary_; std::shared_ptr qnn_manager_; + flatbuffers::FlatBufferBuilder builder_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/partition/qnn_partitioner.py b/backends/qualcomm/partition/qnn_partitioner.py index 659bda517f0..05054cc5d8c 100644 --- a/backends/qualcomm/partition/qnn_partitioner.py +++ b/backends/qualcomm/partition/qnn_partitioner.py @@ -10,9 +10,9 @@ import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager import torch from executorch.backends.qualcomm.builders import node_visitor +from executorch.backends.qualcomm.builders.qnn_constants import OpContextLoader from executorch.backends.qualcomm.qnn_preprocess import QnnBackend from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER -from executorch.backends.qualcomm.utils.utils import generate_qnn_executorch_option from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( @@ -32,6 +32,7 @@ not_supported_operator, to_be_implemented_operator, ) +from .utils import generate_qnn_executorch_option class QnnOperatorSupport(OperatorSupportBase): @@ -63,7 +64,11 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: ) return False - if node.target in allow_list_operator: + if ( + node.target in allow_list_operator + # bypass if custom op appears + or OpContextLoader.namespace == node.target.namespace + ): return True if ( diff --git a/backends/qualcomm/partition/utils.py b/backends/qualcomm/partition/utils.py new file mode 100644 index 00000000000..88b922d4e1f --- /dev/null +++ b/backends/qualcomm/partition/utils.py @@ -0,0 +1,22 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import List + +from executorch.backends.qualcomm.utils.constants import QCOM_QNN_COMPILE_SPEC + +from executorch.exir.backend.compile_spec_schema import CompileSpec + + +def generate_qnn_executorch_option( + compiler_specs: List[CompileSpec], +) -> bytes: + for compiler_spec in compiler_specs: + if compiler_spec.key == QCOM_QNN_COMPILE_SPEC: + qnn_compile_spec_buffer = compiler_spec.value + else: + raise ValueError(f"unknown compiler spec key value: {compiler_spec.key}") + return qnn_compile_spec_buffer diff --git a/backends/qualcomm/qnn_preprocess.py b/backends/qualcomm/qnn_preprocess.py index f13d3fb55ae..0575137cbc3 100644 --- a/backends/qualcomm/qnn_preprocess.py +++ b/backends/qualcomm/qnn_preprocess.py @@ -19,7 +19,7 @@ from executorch.backends.qualcomm._passes.layout_transform import LayoutTransform from executorch.backends.qualcomm.builders.node_visitor import get_node_visitors from executorch.backends.qualcomm.builders.qnn_constants import OpContextLoader -from executorch.backends.qualcomm.utils.utils import generate_qnn_executorch_option +from executorch.backends.qualcomm.partition.utils import generate_qnn_executorch_option from executorch.exir.backend.backend_details import ( BackendDetails, CompileSpec, @@ -83,7 +83,7 @@ def preprocess( ) try: context_loader_target = eval( - f"torch.ops.{OpContextLoader.namespace}.{node.name}.default", + f"torch.ops.{OpContextLoader.namespace}.{node.target.__name__}", globals().update(torch.__dict__), ) assert node.target == context_loader_target, err_msg @@ -104,11 +104,13 @@ def preprocess( else: raise RuntimeError(f"{node.op} is not supported in Qnn") qnn_context_binary = qnn_manager.Compile( - [py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrapper_list] + qnn_manager.GetGraphNames()[0], + [py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrapper_list], ) assert len(qnn_context_binary) != 0, "Failed to generate Qnn context binary." qnn_manager.Destroy() # For now, debug_handle_map is not used by QNN ExecuTorch return PreprocessResult( - processed_bytes=bytes(qnn_context_binary), debug_handle_map={} + processed_bytes=bytes(qnn_context_binary), + debug_handle_map={}, ) diff --git a/backends/qualcomm/runtime/Logging.h b/backends/qualcomm/runtime/Logging.h index 66705de2ac3..8c0843afab9 100644 --- a/backends/qualcomm/runtime/Logging.h +++ b/backends/qualcomm/runtime/Logging.h @@ -7,7 +7,7 @@ */ #pragma once -#include +#include #include namespace executorch { namespace backends { diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 5a55df6da3f..ace6d5ee50c 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -6,14 +6,15 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include #include +#include #include #include -#include + namespace executorch { namespace backends { namespace qnn { + using namespace qnn_delegate; using executorch::runtime::ArrayRef; using executorch::runtime::BackendExecutionContext; @@ -24,6 +25,7 @@ using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; using executorch::runtime::MemoryAllocator; using executorch::runtime::Result; + // ========== Public method implementations ========================= constexpr const char* QNN_COMPILE_SPEC = "qnn_compile_spec"; Result QnnExecuTorchBackend::init( @@ -45,6 +47,7 @@ Result QnnExecuTorchBackend::init( else QNN_EXECUTORCH_LOG_WARN("unknown argument: %s", compile_spec.key); } + // Create QnnManager MemoryAllocator* runtime_allocator = context.get_runtime_allocator(); QnnManager* qnn_manager = @@ -54,130 +57,39 @@ Result QnnExecuTorchBackend::init( // destructible, we must call the destructor manually in destroy(). new (qnn_manager) QnnManager(qnn_executorch_options, qnn_context_blob); + // TODO: this is a temporal solution for multi-graph support, will be + // removed once framework starts to accept runtime configuration + // --- + // check if current context binary has already been initialized + // return cached one for reducing memory footprint + std::string signature = qnn_manager->GetBinarySignature(); + auto iter = delegate_map_.find(signature); + if (iter != delegate_map_.end()) { + QNN_EXECUTORCH_LOG_INFO( + "Use cached delegate handle for current method: %s", + context.get_method_name()); + return iter->second; + } + ET_CHECK_OR_RETURN_ERROR( qnn_manager->Init() == Error::Ok, Internal, "Fail to initialize Qnn Manager"); if (qnn_manager->IsOnlinePrepare()) { - auto graph = qcir::GetGraph(qnn_context_blob.buffer); - // qcir tensors to TensorWrapper - std::vector> tensors, graph_inputs, - graph_outputs; - for (const auto& tensor : *graph->tensors()) { - tensors.emplace_back(CreateTensorWrapper(ToTensor(tensor))); - if (tensor->type() == qcir::TensorType::WRITE) { - graph_inputs.push_back(tensors.back()); - } else if (tensor->type() == qcir::TensorType::READ) { - graph_outputs.push_back(tensors.back()); - } - } - - std::vector> op_wrappers; - // qcir graph node to OpWrapper - for (const auto& node : *graph->nodes()) { - std::shared_ptr op = std::make_shared( - node->name()->str(), - node->package_name()->str(), - node->type_name()->str()); - - // qcir input tensors to OpWrapper input tensors - std::vector> inputs; - for (uint32_t index : *node->inputs()) { - inputs.push_back(tensors[index]); - } - op->AddInputTensors(inputs); - - // qcir output tensors to OpWrapper output tensors - std::vector> outputs; - for (uint32_t index : *node->outputs()) { - outputs.push_back(tensors[index]); - } - op->AddOutputTensors(outputs); - - // qcir operator param to OpWrapper param - for (uint32_t index : *node->params()) { - const auto& tensor = graph->tensors()->Get(index); - std::string name = tensor->name()->str(); - Qnn_DataType_t dtype = ToDataType(tensor->dtype()); - if (tensor->shape()->size() != 0) { - // add tensor param - op->AddTensorParam( - name, - dtype, - tensor->shape()->size(), - tensor->shape()->data(), - tensor->data()->data()); - } else { - // add scalar param - switch (dtype) { - case Qnn_DataType_t::QNN_DATATYPE_INT_32: - op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); - break; - case Qnn_DataType_t::QNN_DATATYPE_INT_16: - op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); - break; - 
case Qnn_DataType_t::QNN_DATATYPE_INT_8: - op->AddScalarParam( - name, dtype, static_cast(*tensor->data()->Data())); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_32: - op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_16: - op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_8: - op->AddScalarParam(name, dtype, *tensor->data()->Data()); - break; - case Qnn_DataType_t::QNN_DATATYPE_FLOAT_32: - case Qnn_DataType_t::QNN_DATATYPE_FLOAT_16: - op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); - break; - case Qnn_DataType_t::QNN_DATATYPE_BOOL_8: - op->AddScalarParam(name, dtype, *tensor->data()->Data()); - break; - default: - QNN_EXECUTORCH_LOG_ERROR( - "Invalid scalar type: %s", tensor->name()->c_str()); - break; - } - } - } - op_wrappers.push_back(std::move(op)); - } - - QnnExecuTorchContextBinary context_binary; - ET_CHECK_OR_RETURN_ERROR( - qnn_manager->Compile(op_wrappers, context_binary) == Error::Ok, - Internal, - "Fail to compile graph in online prepare stage"); - ET_CHECK_OR_RETURN_ERROR( - qnn_manager->AllocateTensor(graph_inputs, graph_outputs) == Error::Ok, + qnn_manager->CompileQcir() == Error::Ok, Internal, - "Fail to allocate tensor in online prepare stage"); + "Fail to compile binary in qcir format"); } else { - ET_CHECK_OR_RETURN_ERROR( - qnn_manager->AllocateTensor() == Error::Ok, - Internal, - "Fail to allocate tensor"); + for (const std::string& graph_name : qnn_manager->GetGraphNames()) { + ET_CHECK_OR_RETURN_ERROR( + qnn_manager->AllocateTensor(graph_name) == Error::Ok, + Internal, + "Fail to allocate tensor"); + } } + add_cached_delegate(signature, qnn_manager); return qnn_manager; } @@ -185,12 +97,17 @@ Error QnnExecuTorchBackend::execute( BackendExecutionContext& context, DelegateHandle* handle, EValue** args) const { + ET_CHECK_OR_RETURN_ERROR( + delegate_map_rev_.count(handle) != 0, + Internal, + "DelegateHandle has been deleted"); QnnManager* qnn_manager = static_cast(handle); + std::string method_name = context.get_method_name(); std::vector> input_tensors = - qnn_manager->GetGraphInputs(); + qnn_manager->GetGraphInputs(method_name); std::vector> output_tensors = - qnn_manager->GetGraphOutputs(); + qnn_manager->GetGraphOutputs(method_name); std::vector input_tensor_structs; std::vector output_tensor_structs; @@ -223,13 +140,15 @@ Error QnnExecuTorchBackend::execute( ET_CHECK_OR_RETURN_ERROR( qnn_manager->Execute( + method_name, input_tensor_structs, output_tensor_structs, context.event_tracer()) == Error::Ok, Internal, "Fail to execute graph"); ET_CHECK_OR_RETURN_ERROR( - qnn_manager->ProfileExecuteData(context.event_tracer()) == Error::Ok, + qnn_manager->ProfileExecuteData(method_name, context.event_tracer()) == + Error::Ok, Internal, "Fail to profile graph"); @@ -237,9 +156,10 @@ Error QnnExecuTorchBackend::execute( } void QnnExecuTorchBackend::destroy(DelegateHandle* handle) const { - if (handle != nullptr) { + if (handle != nullptr && delegate_map_rev_.count(handle)) { QnnManager* qnn_manager = static_cast(handle); qnn_manager->Destroy(); + erase_cached_delegate(handle); } } @@ -247,6 +167,25 @@ bool QnnExecuTorchBackend::is_available() const { return true; } +void QnnExecuTorchBackend::add_cached_delegate( + const std::string& signature, + executorch::runtime::DelegateHandle* handle) const { + std::lock_guard guard(mutex_); + delegate_map_[signature] = handle; + 
delegate_map_rev_[handle] = signature; +} + +void QnnExecuTorchBackend::erase_cached_delegate( + executorch::runtime::DelegateHandle* handle) const { + std::lock_guard guard(mutex_); + auto iter = delegate_map_rev_.find(handle); + if (iter == delegate_map_rev_.end()) { + return; + } + delegate_map_.erase(iter->second); + delegate_map_rev_.erase(handle); +} + namespace { auto cls = QnnExecuTorchBackend(); executorch::runtime::Backend backend{"QnnBackend", &cls}; diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.h b/backends/qualcomm/runtime/QnnExecuTorchBackend.h index 70677b0009b..630067da48a 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.h +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.h @@ -11,6 +11,9 @@ #include #include +#include +#include + namespace executorch { namespace backends { namespace qnn { @@ -34,6 +37,18 @@ class QnnExecuTorchBackend final void destroy(executorch::runtime::DelegateHandle* handle) const override; bool is_available() const override; + + private: + void add_cached_delegate( + const std::string& signature, + executorch::runtime::DelegateHandle* handle) const; + void erase_cached_delegate(executorch::runtime::DelegateHandle* handle) const; + + mutable std::mutex mutex_; + mutable std::unordered_map + delegate_map_; + mutable std::unordered_map + delegate_map_rev_; }; } // namespace qnn diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 9eeb6a8a016..a4d83585f28 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -5,6 +5,9 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ + +#include +#include #include #include #include @@ -301,10 +304,14 @@ Error QnnManager::Init() { backend_params_ptr_->qnn_context_ptr_->Configure() == Error::Ok, Internal, "Fail to configure Qnn context"); - ET_CHECK_OR_RETURN_ERROR( - backend_params_ptr_->qnn_graph_ptr_->Configure() == Error::Ok, - Internal, - "Fail to configure Qnn graph"); + for (const std::string& graph_name : + backend_params_ptr_->qnn_context_ptr_->GetGraphNames()) { + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_->qnn_graph_ptr_->Configure(graph_name) == + Error::Ok, + Internal, + "Fail to configure Qnn graph"); + } backend_params_ptr_->backend_init_state_ = BackendInitializeState::INITIALIZED; } @@ -318,20 +325,22 @@ Error QnnManager::Init() { return Error::Ok; } -Error QnnManager::AllocateTensor() { +Error QnnManager::AllocateTensor(const std::string& graph_name) { std::vector input_tensors = - backend_params_ptr_->qnn_context_ptr_->GetGraphInputs(); + backend_params_ptr_->qnn_context_ptr_->GetGraphInputs(graph_name); std::vector output_tensors = - backend_params_ptr_->qnn_context_ptr_->GetGraphOutputs(); + backend_params_ptr_->qnn_context_ptr_->GetGraphOutputs(graph_name); for (auto& tensor : input_tensors) { std::shared_ptr tensor_wrapper = CreateTensorWrapper(tensor); tensor_wrapper->UpdateQnnTensorMeta(tensor); - input_tensors_.emplace_back(std::move(tensor_wrapper)); + input_tensors_[graph_name].emplace_back(std::move(tensor_wrapper)); } if (!options_->is_from_context_binary()) { std::sort( - input_tensors_.begin(), input_tensors_.end(), CompareExportedInput); + input_tensors_[graph_name].begin(), + input_tensors_[graph_name].end(), + CompareExportedInput); } for (size_t i = 0; i < output_tensors.size(); ++i) { std::shared_ptr tensor_wrapper = @@ -347,36 +356,37 @@ Error 
QnnManager::AllocateTensor() { if (IsTensorDump()) { tensor_wrapper->AllocateDataBuffer(); } - output_tensors_.emplace_back(std::move(tensor_wrapper)); + output_tensors_[graph_name].emplace_back(std::move(tensor_wrapper)); } return Error::Ok; } Error QnnManager::AllocateTensor( + const std::string& graph_name, std::vector>& inputs, std::vector>& outputs) { - input_tensors_ = std::move(inputs); - for (auto& output_tensor : outputs) { - if (IsTensorDump()) { - output_tensor->AllocateDataBuffer(); - } - } + input_tensors_[graph_name] = std::move(inputs); + // TODO: suuport per-tensor dump in online prepare mode + // should be achievable with some pre-process if (!options_->is_from_context_binary()) { std::sort( - input_tensors_.begin(), input_tensors_.end(), CompareExportedInput); + input_tensors_[graph_name].begin(), + input_tensors_[graph_name].end(), + CompareExportedInput); } - output_tensors_ = std::move(outputs); + output_tensors_[graph_name] = std::move(outputs); return Error::Ok; } Error QnnManager::Execute( + const std::string& graph_name, const std::vector& input_tensor_structs, std::vector& output_tensor_structs, executorch::runtime::EventTracer* event_tracer) { Qnn_ErrorHandle_t error = QNN_SUCCESS; error = backend_params_ptr_->qnn_graph_ptr_->GraphExecute( - input_tensor_structs, output_tensor_structs); + graph_name, input_tensor_structs, output_tensor_structs); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( @@ -413,11 +423,12 @@ Error QnnManager::Execute( } Error QnnManager::ProfileExecuteData( + const std::string& graph_name, executorch::runtime::EventTracer* event_tracer) { Qnn_ErrorHandle_t error = QNN_SUCCESS; if (options_->profile_level() != QnnExecuTorchProfileLevel::kProfileOff) { - error = - backend_params_ptr_->qnn_graph_ptr_->ProfileExecuteData(event_tracer); + error = backend_params_ptr_->qnn_graph_ptr_->ProfileExecuteData( + graph_name, event_tracer); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( " Failed to profile. 
Error %d", QNN_GET_ERROR_CODE(error)); @@ -465,16 +476,163 @@ bool QnnManager::IsNodeSupportedByBackend( return true; } -Error QnnManager::Compile( - std::vector>& op_wrappers, +Error QnnManager::GetContextBinary( QnnExecuTorchContextBinary& qnn_executorch_context_binary) { + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_->qnn_context_ptr_->GetContextBinary( + qnn_executorch_context_binary) == Error::Ok, + Internal, + "Fail to get context binary."); + + return Error::Ok; +} + +Error QnnManager::CompileQcir() { + flatbuffers::Verifier verifier_binary_info( + static_cast(qnn_context_blob_.buffer), + qnn_context_blob_.nbytes); + if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { + QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); + return Error::Internal; + } + + auto binary_info = qnn_delegate::GetBinaryInfo(qnn_context_blob_.buffer); + flatbuffers::Verifier verifier_qcir( + binary_info->data()->data(), binary_info->data()->size()); + if (!qcir::VerifyContextBuffer(verifier_qcir)) { + QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format"); + return Error::Internal; + } + + auto context = qcir::GetContext(binary_info->data()->data()); + for (const auto& graph : *context->graphs()) { + // qcir tensors to TensorWrapper + std::vector> graph_inputs, graph_outputs, + tensors; + for (const auto& tensor : *graph->tensors()) { + tensors.emplace_back(CreateTensorWrapper(ToTensor(tensor))); + if (tensor->type() == qcir::TensorType::WRITE) { + graph_inputs.push_back(tensors.back()); + } else if (tensor->type() == qcir::TensorType::READ) { + graph_outputs.push_back(tensors.back()); + } + } + std::vector> op_wrappers; + // qcir graph node to OpWrapper + for (const auto& node : *graph->nodes()) { + std::shared_ptr op = std::make_shared( + node->name()->str(), + node->package_name()->str(), + node->type_name()->str()); + + // qcir input tensors to OpWrapper input tensors + std::vector> inputs; + for (uint32_t index : *node->inputs()) { + inputs.push_back(tensors[index]); + } + op->AddInputTensors(inputs); + + // qcir output tensors to OpWrapper output tensors + std::vector> outputs; + for (uint32_t index : *node->outputs()) { + outputs.push_back(tensors[index]); + } + op->AddOutputTensors(outputs); + + // qcir operator param to OpWrapper param + for (uint32_t index : *node->params()) { + const auto& tensor = graph->tensors()->Get(index); + std::string name = tensor->name()->str(); + Qnn_DataType_t dtype = ToDataType(tensor->dtype()); + if (tensor->shape()->size() != 0) { + // add tensor param + op->AddTensorParam( + name, + dtype, + tensor->shape()->size(), + tensor->shape()->data(), + tensor->data()->data()); + } else { + // add scalar param + switch (dtype) { + case Qnn_DataType_t::QNN_DATATYPE_INT_32: + op->AddScalarParam( + name, + dtype, + *reinterpret_cast(tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_INT_16: + op->AddScalarParam( + name, + dtype, + *reinterpret_cast(tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_INT_8: + op->AddScalarParam( + name, dtype, static_cast(*tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_UINT_32: + op->AddScalarParam( + name, + dtype, + *reinterpret_cast(tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_UINT_16: + op->AddScalarParam( + name, + dtype, + *reinterpret_cast(tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_UINT_8: + op->AddScalarParam(name, dtype, *tensor->data()->Data()); + break; + case 
Qnn_DataType_t::QNN_DATATYPE_FLOAT_32: + case Qnn_DataType_t::QNN_DATATYPE_FLOAT_16: + op->AddScalarParam( + name, + dtype, + *reinterpret_cast(tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_BOOL_8: + op->AddScalarParam(name, dtype, *tensor->data()->Data()); + break; + default: + QNN_EXECUTORCH_LOG_ERROR( + "Invalid scalar type: %s", tensor->name()->c_str()); + break; + } + } + } + op_wrappers.push_back(std::move(op)); + } + + ET_CHECK_OR_RETURN_ERROR( + Compile(graph->name()->str(), op_wrappers) == Error::Ok, + Internal, + "Fail to compile graph from qcir with graph_name: %s", + graph->name()->str().c_str()); + + ET_CHECK_OR_RETURN_ERROR( + AllocateTensor(graph->name()->str(), graph_inputs, graph_outputs) == + Error::Ok, + Internal, + "Fail to allocate tensor for qcir with graph_name: %s", + graph->name()->str().c_str()); + } + + return Error::Ok; +} + +Error QnnManager::Compile( + const std::string& graph_name, + std::vector>& op_wrappers) { Qnn_ErrorHandle_t error = QNN_SUCCESS; for (std::shared_ptr& op_wrapper : op_wrappers) { for (const auto& tensor_wrapper : op_wrapper->GetInputTensors()) { ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_->qnn_graph_ptr_->EnsureTensorInQnnGraph( - tensor_wrapper) == Error::Ok, + graph_name, tensor_wrapper) == Error::Ok, Internal, "Tensor name %s isn't added to Qnn Graph", tensor_wrapper->GetName().c_str()); @@ -483,7 +641,7 @@ Error QnnManager::Compile( for (const auto& tensor_wrapper : op_wrapper->GetOutputTensors()) { ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_->qnn_graph_ptr_->EnsureTensorInQnnGraph( - tensor_wrapper) == Error::Ok, + graph_name, tensor_wrapper) == Error::Ok, Internal, "Tensor name %s isn't added to Qnn Graph", tensor_wrapper->GetName().c_str()); @@ -494,7 +652,7 @@ Error QnnManager::Compile( if (p_tensor_param != nullptr) { ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_->qnn_graph_ptr_->EnsureTensorInQnnGraph( - p_tensor_param->GetTensorWrapper()) == Error::Ok, + graph_name, p_tensor_param->GetTensorWrapper()) == Error::Ok, Internal, "Param tensor name %s isn't added to Qnn Graph", p_tensor_param->GetName().c_str()); @@ -506,7 +664,7 @@ Error QnnManager::Compile( } error = backend_params_ptr_->qnn_graph_ptr_->GraphAddNode( - op_wrapper->GetOpConfig()); + graph_name, op_wrapper->GetOpConfig()); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Failed to add node to Qnn Graph with error: %d", @@ -515,7 +673,7 @@ Error QnnManager::Compile( } } - error = backend_params_ptr_->qnn_graph_ptr_->GraphFinalize(); + error = backend_params_ptr_->qnn_graph_ptr_->GraphFinalize(graph_name); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Failed to finalize Qnn Graph with error: %d", @@ -523,17 +681,18 @@ Error QnnManager::Compile( return Error::Internal; } - // no need to generate extra context binary in online prepare scenario - if (!IsOnlinePrepare()) { - ET_CHECK_OR_RETURN_ERROR( - backend_params_ptr_->qnn_context_ptr_->GetContextBinary( - qnn_executorch_context_binary) == Error::Ok, - Internal, - "Fail to get context binary."); - } - return Error::Ok; -}; +} + +std::string QnnManager::GetBinarySignature() { + flatbuffers::Verifier verifier( + static_cast(qnn_context_blob_.buffer), + qnn_context_blob_.nbytes); + return VerifyBinaryInfoBuffer(verifier) + ? 
GetBinaryInfo(qnn_context_blob_.buffer)->signature()->str() + : ""; +} + } // namespace qnn } // namespace backends } // namespace executorch diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 2b0fc09a591..0157ee58378 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -9,10 +9,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -30,17 +30,20 @@ class QnnManager { ~QnnManager(); executorch::runtime::Error Init(); - executorch::runtime::Error AllocateTensor(); + executorch::runtime::Error AllocateTensor(const std::string& graph_name); executorch::runtime::Error AllocateTensor( + const std::string& graph_name, std::vector>& inputs, std::vector>& outputs); executorch::runtime::Error Execute( + const std::string& graph_name, const std::vector& input_tensor_structs, std::vector& output_tensor_structs, executorch::runtime::EventTracer* event_tracer); executorch::runtime::Error ProfileExecuteData( + const std::string& graph_name, executorch::runtime::EventTracer* event_tracer); void Destroy(); @@ -53,6 +56,10 @@ class QnnManager { return options_->online_prepare(); } + bool IsMultipleGraphs() { + return options_->multiple_graphs(); + } + bool IsTensorDump() { return options_->dump_intermediate_outputs(); } @@ -60,10 +67,15 @@ class QnnManager { bool IsNodeSupportedByBackend( std::vector>& op_wrappers); - executorch::runtime::Error Compile( - std::vector>& op_wrappers, + executorch::runtime::Error GetContextBinary( QnnExecuTorchContextBinary& qnn_executorch_context_binary); + executorch::runtime::Error CompileQcir(); + + executorch::runtime::Error Compile( + const std::string& graph_name, + std::vector>& op_wrappers); + executorch::runtime::Error RegisterMem( void* data_ptr, const std::shared_ptr& tensor_wrapper); @@ -77,13 +89,26 @@ class QnnManager { return htp_backend_cache_ptr->GetSpillFillBufferSize(); } - std::vector> GetGraphInputs() { - return input_tensors_; + std::vector> GetGraphInputs( + const std::string& graph_name) { + return !input_tensors_.count(graph_name) + ? std::vector>() + : input_tensors_[graph_name]; } - std::vector> GetGraphOutputs() { - return output_tensors_; + + std::vector> GetGraphOutputs( + const std::string& graph_name) { + return !output_tensors_.count(graph_name) + ? 
std::vector>() + : output_tensors_[graph_name]; } + std::vector GetGraphNames() { + return backend_params_ptr_->qnn_context_ptr_->GetGraphNames(); + } + + std::string GetBinarySignature(); + private: executorch::runtime::Error LoadQnnLibrary(); @@ -96,8 +121,10 @@ class QnnManager { QnnImplementation qnn_loaded_backend_; std::unique_ptr logger_; const QnnExecuTorchOptions* options_; - std::vector> input_tensors_; - std::vector> output_tensors_; + std::unordered_map>> + input_tensors_; + std::unordered_map>> + output_tensors_; executorch::runtime::Error RegisterIonMem( void* data_ptr, const std::shared_ptr& tensor_wrapper); diff --git a/backends/qualcomm/runtime/backends/CMakeLists.txt b/backends/qualcomm/runtime/backends/CMakeLists.txt index 9147d4f32a9..2df806db52c 100644 --- a/backends/qualcomm/runtime/backends/CMakeLists.txt +++ b/backends/qualcomm/runtime/backends/CMakeLists.txt @@ -91,6 +91,7 @@ target_sources( ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpGraph.cpp ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpGraphCustomConfig.h ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpGraphCustomConfig.cpp + ${HOST_ARCHITECTURE}/HtpGraphCustomConfig.cpp ) # qnn_backend diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 350c040b221..43cb835cfff 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -5,29 +5,30 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ + #include +#include #include + namespace executorch { namespace backends { namespace qnn { using executorch::runtime::Error; -Error QnnBackendCache::GetQnnGraphInfoFromBinary() { +Error QnnBackendCache::GetQnnGraphInfoFromBinary( + void* buffer, + uint32_t nbytes) { const QnnSystemInterface& qnn_sys_interface = qnn_sys_impl_.GetQnnSystemInterface(); std::uint32_t num_graphs; - QnnSystemContext_GraphInfo_t* graph = nullptr; + QnnSystemContext_GraphInfo_t* graphs = nullptr; const QnnSystemContext_BinaryInfo_t* binaryinfo{nullptr}; Qnn_ContextBinarySize_t binaryinfo_size = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; error = qnn_sys_interface.qnn_system_context_get_binary_info( - sys_context_handle_, - qnn_context_blob_.buffer, - qnn_context_blob_.nbytes, - &binaryinfo, - &binaryinfo_size); + sys_context_handle_, buffer, nbytes, &binaryinfo, &binaryinfo_size); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_WARN( @@ -47,45 +48,26 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary() { if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { num_graphs = binaryinfo->contextBinaryInfoV1.numGraphs; - graph = binaryinfo->contextBinaryInfoV1.graphs; + graphs = binaryinfo->contextBinaryInfoV1.graphs; } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { num_graphs = binaryinfo->contextBinaryInfoV2.numGraphs; - graph = binaryinfo->contextBinaryInfoV2.graphs; + graphs = binaryinfo->contextBinaryInfoV2.graphs; } else { QNN_EXECUTORCH_LOG_WARN( "Unknown QNN BinaryInfo version %d.", binaryinfo->version); return Error::Internal; } - if (num_graphs > 1) { - QNN_EXECUTORCH_LOG_WARN( - "The context binary contains %lu graphs. 
But now " - "assume that one context binary contains one graph.", - num_graphs); - return Error::Internal; - } - - // only have version_1 now - if (graph[0].version != QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) { - QNN_EXECUTORCH_LOG_WARN( - "Unknown QNN GraphInfo version %d.", graph[0].version); - return Error::Internal; - } - // get graph name from metadata - graph_name_ = graph->graphInfoV1.graphName; - - // get graph inputs from metadata - uint32_t numGraphInputs = graph->graphInfoV1.numGraphInputs; - input_tensor_structs_.reserve(numGraphInputs); - for (std::uint32_t i = 0; i < numGraphInputs; ++i) { - input_tensor_structs_.emplace_back(graph->graphInfoV1.graphInputs[i]); - } - - // get graph outputs from metadata - uint32_t numGraphOutputs = graph->graphInfoV1.numGraphOutputs; - output_tensor_structs_.reserve(numGraphOutputs); - for (std::uint32_t i = 0; i < numGraphOutputs; ++i) { - output_tensor_structs_.emplace_back(graph->graphInfoV1.graphOutputs[i]); + for (std::uint32_t i = 0; i < num_graphs; ++i) { + if (graphs->version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) { + RetrieveGraphInfo(graphs[i].graphInfoV1); + } else if (graphs->version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_2) { + RetrieveGraphInfo(graphs[i].graphInfoV2); + } else { + QNN_EXECUTORCH_LOG_WARN( + "Unknown QNN GraphInfo version %d.", binaryinfo->version); + return Error::Internal; + } } return Error::Ok; @@ -94,6 +76,8 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary() { Error QnnBackendCache::Configure() { if (qnn_context_blob_.buffer == nullptr) { state_ = SERIALIZE; + // use aot_graph_name if we're lowering graph on host side + graph_names_.push_back(aot_graph_name_); QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in SAVE MODE."); return Error::Ok; } @@ -123,16 +107,30 @@ Error QnnBackendCache::Configure() { // DO DESERIALIZE state_ = DESERIALIZE; QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in RESTORE MODE."); - Error status = GetQnnGraphInfoFromBinary(); + flatbuffers::Verifier verifier_binary_info( + static_cast(qnn_context_blob_.buffer), + qnn_context_blob_.nbytes); + if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { + QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); + return Error::Internal; + } + + auto binary_info = GetBinaryInfo(qnn_context_blob_.buffer); + Error status = GetQnnGraphInfoFromBinary( + const_cast(binary_info->data()->data()), + binary_info->data()->size()); + if (status == Error::Internal) { // check if context binary came from flatbuffer - flatbuffers::FlatBufferBuilder builder; flatbuffers::Verifier verifier( - static_cast(qnn_context_blob_.buffer), - qnn_context_blob_.nbytes); + binary_info->data()->data(), binary_info->data()->size()); - if (qcir::VerifyGraphBuffer(verifier)) { + if (qcir::VerifyContextBuffer(verifier)) { state_ = ONLINE_PREPARE; + auto context = qcir::GetContext(binary_info->data()->data()); + for (const auto& graph : *context->graphs()) { + graph_names_.emplace_back(graph->name()->str()); + } return Error::Ok; } @@ -159,19 +157,42 @@ QnnBackendCache::~QnnBackendCache() { qnn_sys_impl_.Unload(); } -std::vector QnnBackendCache::GetGraphInputs() { +std::vector QnnBackendCache::GetGraphInputs( + const std::string& graph_name) { if (state_ != DESERIALIZE) return {}; - return input_tensor_structs_; + return input_tensor_structs_[graph_name]; } -std::vector QnnBackendCache::GetGraphOutputs() { +std::vector QnnBackendCache::GetGraphOutputs( + const std::string& graph_name) { if (state_ != DESERIALIZE) return {}; - return output_tensor_structs_; + 
return output_tensor_structs_[graph_name]; +} + +template +void QnnBackendCache::RetrieveGraphInfo(const INFO& info) { + // get graph name from metadata + graph_names_.push_back(info.graphName); + // get graph inputs from metadata + uint32_t numGraphInputs = info.numGraphInputs; + input_tensor_structs_[graph_names_.back()].reserve(numGraphInputs); + for (std::uint32_t i = 0; i < numGraphInputs; ++i) { + input_tensor_structs_[graph_names_.back()].emplace_back( + info.graphInputs[i]); + } + // get graph outputs from metadata + uint32_t numGraphOutputs = info.numGraphOutputs; + output_tensor_structs_[graph_names_.back()].reserve(numGraphOutputs); + for (std::uint32_t i = 0; i < numGraphOutputs; ++i) { + output_tensor_structs_[graph_names_.back()].emplace_back( + info.graphOutputs[i]); + } } + } // namespace qnn } // namespace backends } // namespace executorch diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.h b/backends/qualcomm/runtime/backends/QnnBackendCache.h index 26af927fbd8..b9e00f0a662 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.h +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.h @@ -11,7 +11,9 @@ #include #include +#include #include + namespace executorch { namespace backends { namespace qnn { @@ -23,17 +25,19 @@ class QnnBackendCache { DESERIALIZE = 2, ONLINE_PREPARE = 3, }; - explicit QnnBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob) - : qnn_context_blob_(qnn_context_blob) {} + explicit QnnBackendCache( + const QnnExecuTorchContextBinary& qnn_context_blob, + const std::string& aot_graph_name) + : qnn_context_blob_(qnn_context_blob), aot_graph_name_(aot_graph_name) {} virtual ~QnnBackendCache(); QnnBackendCache(const QnnBackendCache&) = delete; QnnBackendCache(QnnBackendCache&&) = delete; QnnBackendCache& operator=(const QnnBackendCache&) = delete; QnnBackendCache& operator=(QnnBackendCache&&) = delete; - std::vector GetGraphInputs(); + std::vector GetGraphInputs(const std::string& graph_name); - std::vector GetGraphOutputs(); + std::vector GetGraphOutputs(const std::string& graph_name); const QnnExecuTorchContextBinary& GetQnnContextBlob() { return qnn_context_blob_; @@ -47,8 +51,8 @@ class QnnBackendCache { state_ = INVALID; } - std::string GetGraphName() { - return graph_name_; + std::vector GetGraphNames() { + return graph_names_; } executorch::runtime::Error Configure(); @@ -60,16 +64,24 @@ class QnnBackendCache { } private: - executorch::runtime::Error GetQnnGraphInfoFromBinary(); + executorch::runtime::Error GetQnnGraphInfoFromBinary( + void* buffer, + uint32_t nbytes); + + template + void RetrieveGraphInfo(const INFO& info); CacheState state_{INVALID}; QnnExecuTorchContextBinary qnn_context_blob_; QnnSystemContext_Handle_t sys_context_handle_{nullptr}; QnnSystemImplementation qnn_sys_impl_{"libQnnSystem.so"}; - std::string graph_name_; - std::vector input_tensor_structs_; - std::vector output_tensor_structs_; + std::vector graph_names_; + std::string aot_graph_name_; + std::unordered_map> + input_tensor_structs_; + std::unordered_map> + output_tensor_structs_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index 7b1cb2c2399..29e6686740b 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -60,7 +60,8 @@ std::unique_ptr QnnBackendFactory::Create( implementation, logger, options->soc_info(), htp_options); 
backend_params->qnn_backend_cache_ptr_ = - std::make_unique(qnn_context_blob); + std::make_unique( + qnn_context_blob, options->graph_name()->str()); backend_params->qnn_context_ptr_ = std::make_unique( implementation, @@ -74,7 +75,6 @@ std::unique_ptr QnnBackendFactory::Create( backend_params->qnn_backend_ptr_.get(), backend_params->qnn_context_ptr_.get(), options->profile_level(), - options->graph_name()->str(), options->soc_info(), htp_options); backend_params->qnn_mem_manager_ptr_ = std::make_unique( diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.h b/backends/qualcomm/runtime/backends/QnnBackendFactory.h index f4a4ebf8991..012c2cc7b5b 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.h +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.h @@ -7,6 +7,7 @@ */ #pragma once +#include #include #include #include @@ -21,7 +22,6 @@ #include #include #include -#include #include namespace executorch { diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp index 8064dd69abe..7db5164a1d5 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp @@ -7,6 +7,7 @@ */ #include + namespace executorch { namespace backends { namespace qnn { @@ -45,12 +46,13 @@ Error QnnContext::Configure() { if (cache_->GetCacheState() == QnnBackendCache::DESERIALIZE) { const QnnExecuTorchContextBinary& qnn_context_blob = cache_->GetQnnContextBlob(); + auto binary_info = GetBinaryInfo(qnn_context_blob.buffer); error = qnn_interface.qnn_context_create_from_binary( backend_->GetHandle(), device_->GetHandle(), temp_context_config.empty() ? nullptr : temp_context_config.data(), - qnn_context_blob.buffer, - qnn_context_blob.nbytes, + const_cast(binary_info->data()->data()), + binary_info->data()->size(), &handle_, /*profile=*/nullptr); if (error != QNN_SUCCESS) { @@ -92,7 +94,7 @@ Error QnnContext::GetContextBinary( Qnn_ErrorHandle_t error = qnn_interface.qnn_context_get_binary_size(handle_, &binary_size); if (error == QNN_SUCCESS) { - binary_buffer_.reserve(binary_size); + binary_buffer_.resize(binary_size); error = qnn_interface.qnn_context_get_binary( handle_, binary_buffer_.data(), binary_size, &bytes_written); if (error != QNN_SUCCESS) { @@ -110,8 +112,18 @@ Error QnnContext::GetContextBinary( binary_size); return Error::Internal; } - qnn_executorch_context_binary.buffer = binary_buffer_.data(); - qnn_executorch_context_binary.nbytes = bytes_written; + + auto signature = []() { + return std::to_string(std::chrono::high_resolution_clock::now() + .time_since_epoch() + .count()); + }; + builder_.Reset(); + auto binary_info = qnn_delegate::CreateBinaryInfoDirect( + builder_, signature().c_str(), &binary_buffer_); + builder_.Finish(binary_info); + qnn_executorch_context_binary.buffer = builder_.GetBufferPointer(); + qnn_executorch_context_binary.nbytes = builder_.GetSize(); } } else { QNN_EXECUTORCH_LOG_ERROR( diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.h b/backends/qualcomm/runtime/backends/QnnContextCommon.h index 970b3901c6b..d93390a5379 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.h +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.h @@ -7,6 +7,7 @@ */ #pragma once +#include #include #include #include @@ -36,15 +37,17 @@ class QnnContext { return handle_; } - std::string GetGraphName() { - return cache_->GetGraphName(); + std::vector inline GetGraphNames() { + return cache_->GetGraphNames(); 
} - std::vector GetGraphInputs() { - return cache_->GetGraphInputs(); + std::vector inline GetGraphInputs( + const std::string& graph_name) { + return cache_->GetGraphInputs(graph_name); } - std::vector GetGraphOutputs() { - return cache_->GetGraphOutputs(); + std::vector inline GetGraphOutputs( + const std::string& graph_name) { + return cache_->GetGraphOutputs(graph_name); } QnnBackendCache::CacheState GetCacheState() const { return cache_->GetCacheState(); @@ -68,7 +71,8 @@ class QnnContext { QnnBackend* backend_; QnnDevice* device_; QnnBackendCache* cache_; - std::vector binary_buffer_; + std::vector binary_buffer_; + flatbuffers::FlatBufferBuilder builder_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnDeviceCommon.h b/backends/qualcomm/runtime/backends/QnnDeviceCommon.h index a6a4cc97817..85de00f8623 100644 --- a/backends/qualcomm/runtime/backends/QnnDeviceCommon.h +++ b/backends/qualcomm/runtime/backends/QnnDeviceCommon.h @@ -7,10 +7,10 @@ */ #pragma once +#include #include #include #include -#include #include diff --git a/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp b/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp index 7215472df3d..6da525b4d02 100644 --- a/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp @@ -12,7 +12,7 @@ namespace qnn { using executorch::runtime::Error; -Error QnnGraph::Configure() { +Error QnnGraph::Configure(const std::string& graph_name) { // create qnn backend const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -23,15 +23,22 @@ Error QnnGraph::Configure() { Internal, "Fail to make graph config."); + if (handle_.count(graph_name)) { + QNN_EXECUTORCH_LOG_ERROR( + "Graph '%s' has been configured.", graph_name.c_str()); + return Error::Ok; + } + + Qnn_GraphHandle_t graph_handle = nullptr; if (context_->GetCacheState() == QnnBackendCache::DESERIALIZE) { // retrieve QNN Graph error = qnn_interface.qnn_graph_retrieve( - context_->GetHandle(), context_->GetGraphName().c_str(), &handle_); + context_->GetHandle(), graph_name.c_str(), &graph_handle); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Can't retrieve graph " "%s from context. Error %d.", - context_->GetGraphName().c_str(), + graph_name.c_str(), QNN_GET_ERROR_CODE(error)); return Error::Internal; } @@ -40,9 +47,9 @@ Error QnnGraph::Configure() { context_->GetCacheState() == QnnBackendCache::ONLINE_PREPARE) { Qnn_ErrorHandle_t error = qnn_interface.qnn_graph_create( context_->GetHandle(), - graph_name_.c_str(), + graph_name.c_str(), temp_graph_config.empty() ? nullptr : temp_graph_config.data(), - &handle_); + &graph_handle); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( @@ -54,26 +61,36 @@ Error QnnGraph::Configure() { return Error::Internal; } + // book keep valid handle of created graph + handle_[graph_name] = graph_handle; // The profiler needs to be created after the backend is created. 
- profile_ = + profile_[graph_name] = std::make_unique(implementation_, backend_, profile_level_); return Error::Ok; } Qnn_ErrorHandle_t QnnGraph::GraphExecute( + const std::string& graph_name, const std::vector& input_tensor_structs, std::vector& output_tensor_structs) { + if (!handle_.count(graph_name)) { + QNN_EXECUTORCH_LOG_ERROR( + "graph name: %s does not exist.", graph_name.c_str()); + return QNN_COMMON_ERROR_GENERAL; + } + return implementation_.GetQnnInterface().qnn_graph_execute( - handle_, + handle_[graph_name], input_tensor_structs.data(), input_tensor_structs.size(), output_tensor_structs.data(), output_tensor_structs.size(), - profile_->GetHandle(), + profile_[graph_name]->GetHandle(), /*signalHandle=*/nullptr); }; Error QnnGraph::EnsureTensorInQnnGraph( + const std::string& graph_name, const std::shared_ptr& tensor_wrapper) { const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -81,7 +98,8 @@ Error QnnGraph::EnsureTensorInQnnGraph( if (!tensor_wrapper->IsTensorCreated()) { Qnn_Tensor_t tensor = tensor_wrapper->CloneTensorStruct(); - error = qnn_interface.qnn_tensor_create_graph_tensor(handle_, &tensor); + error = qnn_interface.qnn_tensor_create_graph_tensor( + handle_[graph_name], &tensor); int name_conflict_count = 0; while (error == QNN_TENSOR_ERROR_NAME_HASH_COLLISION) { @@ -99,7 +117,8 @@ Error QnnGraph::EnsureTensorInQnnGraph( // update name_conflict_count++; - error = qnn_interface.qnn_tensor_create_graph_tensor(handle_, &tensor); + error = qnn_interface.qnn_tensor_create_graph_tensor( + handle_[graph_name], &tensor); } tensor_wrapper->UpdateQnnTensorMeta(tensor); tensor_wrapper->SetTensorCreated(); diff --git a/backends/qualcomm/runtime/backends/QnnGraphCommon.h b/backends/qualcomm/runtime/backends/QnnGraphCommon.h index b8cd6c6fab8..62d9b1b9e1a 100644 --- a/backends/qualcomm/runtime/backends/QnnGraphCommon.h +++ b/backends/qualcomm/runtime/backends/QnnGraphCommon.h @@ -26,44 +26,48 @@ class QnnGraph { const QnnImplementation& implementation, QnnBackend* backend, QnnContext* context, - const QnnExecuTorchProfileLevel& profile_level, - const std::string& graph_name) - : handle_(nullptr), - implementation_(implementation), + const QnnExecuTorchProfileLevel& profile_level) + : implementation_(implementation), backend_(backend), context_(context), - profile_level_(profile_level), - graph_name_(graph_name) {} + profile_level_(profile_level) {} virtual ~QnnGraph(){}; - executorch::runtime::Error Configure(); + executorch::runtime::Error Configure(const std::string& graph_name); Qnn_ErrorHandle_t GraphExecute( + const std::string& graph_name, const std::vector& input_tensor_structs, std::vector& output_tensor_structs); - Qnn_ErrorHandle_t GraphAddNode(const Qnn_OpConfig_t& op_config) { + Qnn_ErrorHandle_t GraphAddNode( + const std::string& graph_name, + const Qnn_OpConfig_t& op_config) { return implementation_.GetQnnInterface().qnn_graph_add_node( - handle_, op_config); + handle_[graph_name], op_config); }; executorch::runtime::Error EnsureTensorInQnnGraph( + const std::string& graph_name, const std::shared_ptr& tensor_wrapper); - Qnn_ErrorHandle_t GraphFinalize() { + Qnn_ErrorHandle_t GraphFinalize(const std::string& graph_name) { return implementation_.GetQnnInterface().qnn_graph_finalize( - handle_, profile_->GetHandle(), nullptr /* signal_handle */); + handle_[graph_name], + profile_[graph_name]->GetHandle(), + nullptr /* signal_handle */); }; Qnn_ErrorHandle_t ProfileExecuteData( + const std::string& 
graph_name, executorch::runtime::EventTracer* event_tracer) { - return profile_->ProfileData(event_tracer); + return profile_[graph_name]->ProfileData(event_tracer); }; - Qnn_GraphHandle_t GetHandle() { - return handle_; + Qnn_GraphHandle_t GetHandle(const std::string& graph_name) { + return handle_[graph_name]; } - QnnProfile* GetProfile() { - return profile_.get(); + QnnProfile* GetProfile(const std::string& graph_name) { + return profile_[graph_name].get(); } protected: @@ -73,13 +77,12 @@ class QnnGraph { }; private: - Qnn_GraphHandle_t handle_; + std::unordered_map handle_; const QnnImplementation& implementation_; QnnBackend* backend_; QnnContext* context_; QnnExecuTorchProfileLevel profile_level_; - std::string graph_name_; - std::unique_ptr profile_; + std::unordered_map> profile_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnLogger.h b/backends/qualcomm/runtime/backends/QnnLogger.h index 09c74b53c60..80be4f61c59 100644 --- a/backends/qualcomm/runtime/backends/QnnLogger.h +++ b/backends/qualcomm/runtime/backends/QnnLogger.h @@ -7,8 +7,8 @@ */ #pragma once +#include #include -#include namespace executorch { namespace backends { namespace qnn { diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h index faad456aed4..4dd6897f74a 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h @@ -13,8 +13,10 @@ namespace backends { namespace qnn { class HtpBackendCache : public QnnBackendCache { public: - explicit HtpBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob) - : QnnBackendCache(qnn_context_blob), spill_fill_buf_(0) {} + explicit HtpBackendCache( + const QnnExecuTorchContextBinary& qnn_context_blob, + const std::string& aot_graph_name) + : QnnBackendCache(qnn_context_blob, aot_graph_name), spill_fill_buf_(0) {} ~HtpBackendCache() override = default; uint64_t GetSpillFillBufferSize() { diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h b/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h index a618100fcd1..f0d4873b0d2 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h @@ -8,8 +8,8 @@ #pragma once +#include #include -#include #include #include diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpDeviceCustomConfig.h b/backends/qualcomm/runtime/backends/htpbackend/HtpDeviceCustomConfig.h index 93314388886..e383c4bd460 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpDeviceCustomConfig.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpDeviceCustomConfig.h @@ -7,7 +7,7 @@ */ #pragma once -#include +#include #include #include diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpDevicePlatformInfoConfig.h b/backends/qualcomm/runtime/backends/htpbackend/HtpDevicePlatformInfoConfig.h index 9b75215ff8e..74d282c86e2 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpDevicePlatformInfoConfig.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpDevicePlatformInfoConfig.h @@ -7,7 +7,7 @@ */ #pragma once -#include +#include #include #include diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpGraph.h b/backends/qualcomm/runtime/backends/htpbackend/HtpGraph.h index 8f0f56215d5..c3add50d08b 100644 --- 
a/backends/qualcomm/runtime/backends/htpbackend/HtpGraph.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpGraph.h @@ -23,10 +23,9 @@ class HtpGraph : public QnnGraph { QnnBackend* backend, QnnContext* context, const QnnExecuTorchProfileLevel& profile_level, - const std::string& graph_name, const SocInfo* soc_info, const QnnExecuTorchHtpBackendOptions* htp_options) - : QnnGraph(implementation, backend, context, profile_level, graph_name), + : QnnGraph(implementation, backend, context, profile_level), qcom_target_soc_info_(soc_info), htp_options_(htp_options) { htp_graph_custom_config_ = diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.cpp b/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.cpp index 013403fa73d..d43f8320285 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.cpp +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.cpp @@ -12,8 +12,9 @@ namespace executorch { namespace backends { namespace qnn { std::vector -HtpGraphCustomConfig::CreateGraphCustomConfig( - const SocInfo* qcom_target_soc_info) { +HtpGraphCustomConfig::CreateGraphCustomConfigCommon( + const SocInfo* qcom_target_soc_info, + float opt_level) { std::vector ret; QnnHtpGraph_CustomConfig_t* p_custom_config = nullptr; @@ -45,8 +46,6 @@ HtpGraphCustomConfig::CreateGraphCustomConfig( break; } - float opt_level = - context_->GetCacheState() == QnnBackendCache::ONLINE_PREPARE ? 1 : 3; QNN_EXECUTORCH_LOG_INFO( "Running level=%d optimization.", static_cast(opt_level)); diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.h b/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.h index bf038bc4b9b..4a8e78ce673 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.h @@ -8,7 +8,7 @@ #pragma once #include -#include +#include #include #include @@ -35,6 +35,9 @@ class HtpGraphCustomConfig { htp_graph_config_.back()->option = QNN_HTP_GRAPH_CONFIG_OPTION_UNKNOWN; return htp_graph_config_.back().get(); } + std::vector CreateGraphCustomConfigCommon( + const SocInfo* qcom_target_soc_info, + float opt_level); [[maybe_unused]] const QnnExecuTorchHtpBackendOptions* htp_options_; std::vector> htp_graph_config_; diff --git a/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpGraphCustomConfig.cpp b/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpGraphCustomConfig.cpp new file mode 100644 index 00000000000..096fda7b059 --- /dev/null +++ b/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpGraphCustomConfig.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { +std::vector +HtpGraphCustomConfig::CreateGraphCustomConfig( + const SocInfo* qcom_target_soc_info) { + return CreateGraphCustomConfigCommon(qcom_target_soc_info, 1); +} +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpContextCustomConfig.cpp index 6b5266cf23b..1fc2940eaa7 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpContextCustomConfig.cpp +++ b/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpContextCustomConfig.cpp @@ -14,7 +14,18 @@ namespace qnn { std::vector HtpContextCustomConfig::CreateContextCustomConfig() { - return {}; + std::vector ret; + QnnHtpContext_CustomConfig_t* p_custom_config = nullptr; + + if (htp_options_->use_weight_sharing()) { + p_custom_config = AllocContextCustomConfig(); + p_custom_config->option = + QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED; + p_custom_config->weightSharingEnabled = true; + ret.push_back(static_cast(p_custom_config)); + } + + return ret; } } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpGraphCustomConfig.cpp b/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpGraphCustomConfig.cpp new file mode 100644 index 00000000000..330ca43e20b --- /dev/null +++ b/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpGraphCustomConfig.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { +std::vector +HtpGraphCustomConfig::CreateGraphCustomConfig( + const SocInfo* qcom_target_soc_info) { + return CreateGraphCustomConfigCommon(qcom_target_soc_info, 3); +} +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/serialization/qc_binary_info.fbs b/backends/qualcomm/serialization/qc_binary_info.fbs new file mode 100644 index 00000000000..3f301055269 --- /dev/null +++ b/backends/qualcomm/serialization/qc_binary_info.fbs @@ -0,0 +1,20 @@ +//============================================================================ +// +// Copyright (c) Qualcomm Innovation Center, Inc. +// All rights reserved +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +//============================================================================ + +namespace qnn_delegate; + +table BinaryInfo { + // Signature of binary + signature: string; + // Data of processed binary + data: [ubyte]; +} + +root_type BinaryInfo; diff --git a/backends/qualcomm/serialization/schema.fbs b/backends/qualcomm/serialization/qc_compiler_spec.fbs similarity index 91% rename from backends/qualcomm/serialization/schema.fbs rename to backends/qualcomm/serialization/qc_compiler_spec.fbs index e8ef108f3a6..bd097fc5ccd 100644 --- a/backends/qualcomm/serialization/schema.fbs +++ b/backends/qualcomm/serialization/qc_compiler_spec.fbs @@ -1,8 +1,10 @@ //============================================================================ // -// Copyright (c) 2023 Qualcomm Technologies, Inc. -// All Rights Reserved. -// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// Copyright (c) Qualcomm Innovation Center, Inc. +// All rights reserved +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. // //============================================================================ @@ -118,6 +120,10 @@ table QnnExecuTorchHtpBackendOptions { /// pte, it is possible to reserve a single spill-fill allocation that /// could be re-used across all the splits. use_multi_contexts:bool; + + /// When multiple graphs appear inside the same context, + /// weights could be reused across all graphs. + use_weight_sharing:bool; } /// Logging level of the delegate and QNN backend. @@ -177,7 +183,10 @@ table QnnExecuTorchOptions { shared_buffer:bool; /// Is model from qnn context binary - is_from_context_binary: bool; + is_from_context_binary:bool; + + /// True if there exists multiple graphs in one .pte file. + multiple_graphs:bool; } root_type QnnExecuTorchOptions; diff --git a/backends/qualcomm/serialization/qnn_compile_spec_schema.py b/backends/qualcomm/serialization/qc_schema.py similarity index 95% rename from backends/qualcomm/serialization/qnn_compile_spec_schema.py rename to backends/qualcomm/serialization/qc_schema.py index ce139a54cbe..816d8134184 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_schema.py +++ b/backends/qualcomm/serialization/qc_schema.py @@ -12,6 +12,12 @@ from enum import IntEnum, unique +@dataclass +class BinaryInfo: + signature: str = "" + data: bytes = None + + @unique class HtpArch(IntEnum): NONE = 0 @@ -98,6 +104,7 @@ class QnnExecuTorchHtpBackendOptions: use_dlbc: bool = False use_fold_relu: bool = True use_multi_contexts: bool = False + use_weight_sharing: bool = False @unique @@ -136,3 +143,4 @@ class QnnExecuTorchOptions: profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff shared_buffer: bool = False is_from_context_binary: bool = False + multiple_graphs: bool = False diff --git a/backends/qualcomm/serialization/qnn_compile_spec_serialize.py b/backends/qualcomm/serialization/qc_schema_serialize.py similarity index 50% rename from backends/qualcomm/serialization/qnn_compile_spec_serialize.py rename to backends/qualcomm/serialization/qc_schema_serialize.py index 49227628e5e..59610d7b996 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_serialize.py +++ b/backends/qualcomm/serialization/qc_schema_serialize.py @@ -9,41 +9,45 @@ import tempfile import pkg_resources -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QnnExecuTorchOptions, -) +from executorch.backends.qualcomm.serialization.qc_schema import QnnExecuTorchOptions from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile -def convert_to_flatbuffer(qnn_executorch_options: QnnExecuTorchOptions) -> bytes: - qnn_executorch_options_json = json.dumps( - qnn_executorch_options, cls=_DataclassEncoder - ) +def _convert_to_flatbuffer(obj, schema: str): + obj_json = json.dumps(obj, cls=_DataclassEncoder) with tempfile.TemporaryDirectory() as d: - schema_path = os.path.join(d, "schema.fbs") + schema_path = os.path.join(d, f"{schema}.fbs") with open(schema_path, "wb") as schema_file: - schema_file.write(pkg_resources.resource_string(__name__, "schema.fbs")) - json_path = os.path.join(d, "schema.json") + schema_file.write(pkg_resources.resource_string(__name__, f"{schema}.fbs")) + json_path = os.path.join(d, 
f"{schema}.json") with open(json_path, "wb") as json_file: - json_file.write(qnn_executorch_options_json.encode("ascii")) + json_file.write(obj_json.encode("ascii")) _flatc_compile(d, schema_path, json_path) - output_path = os.path.join(d, "schema.bin") + output_path = os.path.join(d, f"{schema}.bin") with open(output_path, "rb") as output_file: return output_file.read() -def convert_to_option(processed_bytes: bytes) -> QnnExecuTorchOptions: +def _convert_to_object(flatbuffers: bytes, obj_type, schema: str): with tempfile.TemporaryDirectory() as d: - json_path = os.path.join(d, "options.json") - schema_path = os.path.join(d, "schema.fbs") - bin_path = os.path.join(d, "options.bin") + json_path = os.path.join(d, f"{schema}.json") + schema_path = os.path.join(d, f"{schema}.fbs") + bin_path = os.path.join(d, f"{schema}.bin") with open(schema_path, "wb") as schema_file: - schema_file.write(pkg_resources.resource_string(__name__, "schema.fbs")) + schema_file.write(pkg_resources.resource_string(__name__, f"{schema}.fbs")) with open(bin_path, "wb") as bin_file: - bin_file.write(processed_bytes) + bin_file.write(flatbuffers) _flatc_decompile(d, schema_path, bin_path, ["--raw-binary"]) with open(json_path, "rb") as output_file: - return _json_to_dataclass(json.load(output_file), QnnExecuTorchOptions) + return _json_to_dataclass(json.load(output_file), obj_type) + + +def option_to_flatbuffer(qnn_executorch_options: QnnExecuTorchOptions) -> bytes: + return _convert_to_flatbuffer(qnn_executorch_options, "qc_compiler_spec") + + +def flatbuffer_to_option(flatbuffers: bytes) -> QnnExecuTorchOptions: + return _convert_to_object(flatbuffers, QnnExecuTorchOptions, "qc_compiler_spec") diff --git a/backends/qualcomm/serialization/targets.bzl b/backends/qualcomm/serialization/targets.bzl index c3c571109e7..c4a40ba0130 100644 --- a/backends/qualcomm/serialization/targets.bzl +++ b/backends/qualcomm/serialization/targets.bzl @@ -6,10 +6,10 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. """ - + export_file( - name = "qnn_schema", - src = "schema.fbs", + name = "qc_compiler_spec_schema", + src = "qc_compiler_spec.fbs", visibility = ["//executorch/backends/qualcomm/serialization/..."], ) @@ -19,7 +19,7 @@ def define_common_targets(): "*.py", ]), resources = { - ":qnn_schema": "schema.fbs", + ":qc_compiler_spec_schema": "qc_compiler_spec.fbs", }, visibility = [ "@EXECUTORCH_CLIENTS", diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl index 929ccd97441..08d163eefc3 100644 --- a/backends/qualcomm/targets.bzl +++ b/backends/qualcomm/targets.bzl @@ -6,13 +6,13 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") # Construct the input and output file names. All input and output files rely on scalar_type file. 
-SCHEMA_NAME = "schema" +SCHEMA_NAME = "qc_compiler_spec" INPUT_SCHEMA = "serialization/" + SCHEMA_NAME + ".fbs" OUTPUT_SCHEMA_HEADER = SCHEMA_NAME + "_generated.h" -SCHEMA_GEN_RULE_NAME = "schema_generated" +SCHEMA_GEN_RULE_NAME = "qc_compiler_spec_generated" SCHEMA_LIRRARY_NAME = SCHEMA_NAME diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 62f3ecc3ca2..0ed66329c33 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -549,8 +549,8 @@ def forward(self, x): class Index(torch.nn.Module): def __init__(self): super().__init__() - self.idx0 = torch.tensor([[0, 1], [2, 3], [4, 5]]) - self.idx1 = torch.tensor([[1, 2], [3, 4], [5, 6]]) + self.idx0 = torch.tensor([[0, 1], [2, 3], [4, 5]], dtype=torch.int32) + self.idx1 = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=torch.int32) def forward(self, x): return x[self.idx0] + x[self.idx1] diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 4fed86f5df8..10917cdd6bf 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -29,12 +29,13 @@ ) from executorch.backends.qualcomm.utils.utils import ( - canonicalize_program, capture_program, from_context_binary, generate_htp_compiler_spec, + generate_multi_graph_program, generate_qnn_executorch_compiler_spec, skip_annotation, + update_spill_fill_size, ) from executorch.examples.models.llama.llama_transformer import ModelArgs, MOEFeedForward @@ -1536,7 +1537,7 @@ def test_qnn_backend_multi_contexts(self): ) partitioner = QnnPartitioner(compiler_specs) edge_prog.exported_program = to_backend(edge_prog.exported_program, partitioner) - canonicalize_program(edge_prog.exported_program) + update_spill_fill_size(edge_prog.exported_program) exec_prog = edge_prog.to_executorch() self.verify_output(module, sample_input, exec_prog) @@ -1560,10 +1561,54 @@ def test_qnn_backend_multi_contexts_composite(self): edge_prog = to_edge( torch.export.export(module, sample_input), ) - canonicalize_program(edge_prog.exported_program()) + update_spill_fill_size(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) + def test_qnn_backend_multi_graphs(self): + if self.enable_x86_64: + self.skipTest("weight sharing is not supported on host machine") + + seq_conv = Conv2dSequential() # noqa: F405 + # weight sharing + modules = [seq_conv, seq_conv.second] + sample_inputs = [(torch.randn([1, 1, 3, 3]),), (torch.randn([1, 3, 3, 3]),)] + graph_names = ["seq_conv", "single_conv"] + edge_progs = [ + capture_program(module, sample_input) + for module, sample_input in zip(modules, sample_inputs) + ] + backend_options = generate_htp_compiler_spec( + use_fp16=True, + ) + compiler_specs = [ + generate_qnn_executorch_compiler_spec( + soc_model=self.chipset_table[TestQNN.model], + backend_options=backend_options, + multiple_graphs=True, + graph_name=graph_name, + ) + for graph_name in graph_names + ] + exported_programs = [ + to_backend(edge_prog.exported_program, QnnPartitioner(compiler_specs[i])) + for i, edge_prog in enumerate(edge_progs) + ] + prog_mgr = generate_multi_graph_program( + compiler_specs=compiler_specs[0], + processed_bytes=[ + prog.graph_module.lowered_module_0.processed_bytes + for prog in exported_programs + ], + ) + for index, module in enumerate(modules): + self.verify_output( + module=module, + sample_inputs=sample_inputs[index], + 
executorch_prog=prog_mgr, + method_index=index, + ) + def test_qnn_backend_profile_op(self): TestQNN.enable_profile = True backend_options = generate_htp_compiler_spec(use_fp16=True) @@ -1621,22 +1666,13 @@ def test_qnn_backend_context_direct(self): ) ctx_path = f"{tmp_dir}/model_ctx.bin" bundle_program = from_context_binary(ctx_path, "ctx_loader") - backend_options = generate_htp_compiler_spec(use_fp16=True) - compiler_specs = generate_qnn_executorch_compiler_spec( - soc_model=self.chipset_table[TestQNN.model], - backend_options=backend_options, - is_from_context_binary=True, - ) - lowered_module = to_backend( - "QnnBackend", bundle_program["edge_program"], compiler_specs - ) self.verify_output( module, tuple( torch.randn(size=v.shape, dtype=v.dtype) for v in bundle_program["inputs"].values() ), - lowered_module, + bundle_program["edge_program_manager"].to_executorch(), ) @@ -1819,7 +1855,7 @@ def test_qnn_backend_multi_contexts(self): ) partitioner = QnnPartitioner(compiler_specs) edge_prog.exported_program = to_backend(edge_prog.exported_program, partitioner) - canonicalize_program(edge_prog.exported_program) + update_spill_fill_size(edge_prog.exported_program) exec_prog = edge_prog.to_executorch() self.verify_output(module, sample_input, exec_prog) @@ -1844,10 +1880,54 @@ def test_qnn_backend_multi_contexts_composite(self): edge_prog = to_edge( torch.export.export(module, sample_input), ) - canonicalize_program(edge_prog.exported_program()) + update_spill_fill_size(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) + def test_qnn_backend_multi_graphs(self): + if self.enable_x86_64: + self.skipTest("weight sharing is not supported on host machine") + + seq_conv = Conv2dSequential() # noqa: F405 + # weight sharing + modules = [seq_conv, seq_conv.second] + sample_inputs = [(torch.randn([1, 1, 3, 3]),), (torch.randn([1, 3, 3, 3]),)] + graph_names = ["seq_conv", "single_conv"] + edge_progs = [ + capture_program(self.get_qdq_module(module, sample_input), sample_input) + for module, sample_input in zip(modules, sample_inputs) + ] + backend_options = generate_htp_compiler_spec( + use_fp16=True, + ) + compiler_specs = [ + generate_qnn_executorch_compiler_spec( + soc_model=self.chipset_table[TestQNN.model], + backend_options=backend_options, + multiple_graphs=True, + graph_name=graph_name, + ) + for graph_name in graph_names + ] + exported_programs = [ + to_backend(edge_prog.exported_program, QnnPartitioner(compiler_specs[i])) + for i, edge_prog in enumerate(edge_progs) + ] + prog_mgr = generate_multi_graph_program( + compiler_specs=compiler_specs[0], + processed_bytes=[ + prog.graph_module.lowered_module_0.processed_bytes + for prog in exported_programs + ], + ) + for index, module in enumerate(modules): + self.verify_output( + module=module, + sample_inputs=sample_inputs[index], + executorch_prog=prog_mgr, + method_index=index, + ) + def test_qnn_backend_profile_op(self): TestQNN.enable_profile = True backend_options = generate_htp_compiler_spec(use_fp16=False) @@ -1908,22 +1988,13 @@ def test_qnn_backend_context_direct(self): ) ctx_path = f"{tmp_dir}/model_ctx.bin" bundle_program = from_context_binary(ctx_path, "ctx_loader") - backend_options = generate_htp_compiler_spec(use_fp16=False) - compiler_specs = generate_qnn_executorch_compiler_spec( - soc_model=self.chipset_table[TestQNN.model], - backend_options=backend_options, - is_from_context_binary=True, - ) - lowered_module = to_backend( - 
"QnnBackend", bundle_program["edge_program"], compiler_specs - ) self.verify_output( module, tuple( torch.randn(size=v.shape, dtype=v.dtype) for v in bundle_program["inputs"].values() ), - lowered_module, + bundle_program["edge_program_manager"].to_executorch(), ) diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index d2a3e7c2417..96591eb8906 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -9,7 +9,7 @@ import subprocess import tempfile import unittest -from typing import Callable, Dict, List, Literal, Optional, Tuple +from typing import Callable, Dict, List, Optional, Tuple import numpy as np import torch @@ -18,9 +18,7 @@ from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.qnn_preprocess import QnnBackend from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( capture_program, get_soc_to_chipset_map, @@ -35,7 +33,6 @@ from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.pass_base import ExportPass from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from executorch.exir.program import ExecutorchProgram, ExecutorchProgramManager @@ -114,19 +111,19 @@ def generate_context_binary( class TestQNN(unittest.TestCase): rtol: float = 0 atol: float = 0 - host: Literal = "" - device: Literal = "" - build_folder: Literal = "" + host: str = "" + device: str = "" + build_folder: str = "" model: QcomChipset = None compiler_specs: List[CompileSpec] = None chipset_table = get_soc_to_chipset_map() error_only = False ip = "localhost" port = 8080 - executorch_root: Literal = "" - artifact_dir: Literal = "" - image_dataset: Literal = "" - pretrained_weight: Literal = "" + executorch_root: str = "" + artifact_dir: str = "" + image_dataset: str = "" + pretrained_weight: str = "" enable_profile: bool = False online_prepare: bool = False use_8a8w: str = "8a8w" @@ -150,7 +147,7 @@ def _save_model_and_expected_output( module: torch.nn.Module, buffer: exir.ExirExportedProgram, inputs: Tuple[torch.Tensor], - dir_name: Literal, + dir_name: str, ) -> None: # Save the input data list to be executed input_list = "" @@ -181,26 +178,20 @@ def verify_output( # noqa: C901 self, module: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], - executorch_prog: ExecutorchProgram | LoweredBackendModule, + executorch_prog: ExecutorchProgram | ExecutorchProgramManager, etrecord_path: str = "etrecord.bin", expected_profile_events: int = -1, expected_intermediate_events: int = -1, + method_index: int = 0, ): with tempfile.TemporaryDirectory() as tmp_dir: - buffer = ( - executorch_prog.buffer - if isinstance( - executorch_prog, (ExecutorchProgram, ExecutorchProgramManager) - ) - else executorch_prog.buffer() - ) ( input_list, ref_outputs, pte_fname, ) = self._save_model_and_expected_output( module, - buffer, + executorch_prog.buffer, sample_inputs, tmp_dir, ) @@ -253,11 +244,13 @@ def validate_intermediate_tensor(): # qnn_executor_runner 
f"{build_folder}/examples/qualcomm/executor_runner/qnn_executor_runner", "--model_path", - f"{pte_fname}", + pte_fname, "--input_list_path", f"{tmp_dir}/input_list.txt", "--output_folder_path", - f"{output_dir}", + output_dir, + "--method_index", + str(method_index), ] if expected_intermediate_events != -1: cmd.append("--dump_intermediate_outputs") @@ -305,7 +298,7 @@ def validate_intermediate_tensor(): ), ) adb.push(inputs=[sample_inputs], input_list=input_list) - adb.execute() + adb.execute(method_index=method_index) adb.pull(output_path=tmp_dir, callback=post_process) self._assert_outputs_equal(outputs, ref_outputs) @@ -343,7 +336,6 @@ def lower_module_and_test_output( ) exec_prog = delegated_program.to_executorch( exir.ExecutorchBackendConfig( - extract_delegate_segments=False, # For shared buffer, user must pass the memory address # which is allocated by RPC memory to executor runner. # Therefore, won't want to pre-allocate diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index cb54412add0..4bda07fdc2b 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -7,7 +7,7 @@ import operator import warnings from collections import OrderedDict -from typing import Callable, Dict, FrozenSet, List, Set, Tuple +from typing import Callable, Dict, FrozenSet, List, Tuple import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor @@ -50,7 +50,11 @@ QNN_TENSOR_TYPE_MAP, ) from executorch.backends.qualcomm.builders.qnn_constants import OpContextLoader -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( +from executorch.backends.qualcomm.partition.qnn_partitioner import ( + generate_qnn_executorch_option, + QnnPartitioner, +) +from executorch.backends.qualcomm.serialization.qc_schema import ( _soc_info_table, HtpArch, QcomChipset, @@ -63,9 +67,9 @@ QnnExecuTorchOptions, QnnExecuTorchProfileLevel, ) -from executorch.backends.qualcomm.serialization.qnn_compile_spec_serialize import ( - convert_to_flatbuffer, - convert_to_option, +from executorch.backends.qualcomm.serialization.qc_schema_serialize import ( + flatbuffer_to_option, + option_to_flatbuffer, ) from executorch.backends.qualcomm.utils.constants import ( QCOM_PASS_EXPAND_BROADCAST_SHAPE, @@ -74,8 +78,14 @@ QCOM_QUANTIZED_IO, ) -from executorch.exir import ExirExportedProgram +from executorch.exir import ( + EdgeCompileConfig, + ExecutorchProgramManager, + ExirExportedProgram, + to_edge, +) from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.capture import ExecutorchBackendConfig from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.program._program import _get_updated_graph_signature from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions @@ -202,9 +212,8 @@ def replace_linear(module: torch.nn.Module): return replace_linear(module) -def canonicalize_program( +def update_spill_fill_size( exported_program: ExportedProgram | List[LoweredBackendModule], - custom_buffer_size=None, ): # check if user specifies to use multi_contexts # this is a generic approach in case there exists multiple backends @@ -213,7 +222,7 @@ def process_exported_program(prog): max_sf_buf_size, module_map = 0, {} for _, m in prog.graph_module._modules.items(): # currently only 1 compile spec is expected in each partition - options = convert_to_option(m.compile_specs[0].value) + options = flatbuffer_to_option(m.compile_specs[0].value) if ( 
options.backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend @@ -235,14 +244,10 @@ def process_lowered_module(module): module.compile_specs[0].value, module.processed_bytes ) assert qnn_mgr.Init().value == 0, "failed to load context binary" - spill_fill_size = ( - qnn_mgr.GetSpillFillBufferSize() - if custom_buffer_size is None - else custom_buffer_size - ) + spill_fill_size = qnn_mgr.GetSpillFillBufferSize() qnn_mgr.Destroy() return spill_fill_size, { - module: convert_to_option(module.compile_specs[0].value) + module: flatbuffer_to_option(module.compile_specs[0].value) } dispatch = { @@ -253,7 +258,7 @@ def process_lowered_module(module): def update_program(max_sf_buf_size, module_map): def set_spec(module, options): - spec = CompileSpec(QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(options)) + spec = CompileSpec(QCOM_QNN_COMPILE_SPEC, option_to_flatbuffer(options)) if isinstance(module, ExportedProgram): module.compile_specs[0] = spec else: @@ -331,11 +336,10 @@ def _transform( def capture_program( module: torch.nn.Module, inputs: Tuple[torch.Tensor], - custom_pass_config: Set[str] = frozenset(), + custom_pass_config: FrozenSet[str] = frozenset(), ) -> exir.ExirExportedProgram: ep = torch.export.export(module, inputs) decomposed_ep = ep.run_decompositions(get_decomp_table()) - # We choose call_operator by target in ConvertBinaryOpsWithScalar # because it is the same source_fn_stack for MultiheadAttention # TODO: Should modify the scalar op in the op builder instead of @@ -529,11 +533,11 @@ def skip_annotation( Returns: exported_programs: List of programs lowered to QnnBackend (quantized graphs only). """ - from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( + from executorch.backends.qualcomm.serialization.qc_schema import ( QnnExecuTorchHtpPrecision, ) - from executorch.backends.qualcomm.serialization.qnn_compile_spec_serialize import ( - convert_to_option, + from executorch.backends.qualcomm.serialization.qc_schema_serialize import ( + flatbuffer_to_option, ) from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner @@ -585,14 +589,14 @@ def prepare_subgm(subgm, subgm_name): qnn_option = generate_qnn_executorch_option( partitioner.compiler_specs_snapshot ) - compile_option = convert_to_option(qnn_option) + compile_option = flatbuffer_to_option(qnn_option) htp_options = compile_option.backend_options.htp_options htp_options.precision = QnnExecuTorchHtpPrecision.kHtpFp16 partitioner.delegation_spec = DelegationSpec( "QnnBackend", [ CompileSpec( - QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(compile_option) + QCOM_QNN_COMPILE_SPEC, option_to_flatbuffer(compile_option) ) ], ) @@ -626,9 +630,14 @@ def prepare_subgm(subgm, subgm_name): return graph_module, exported_progs -def from_context_binary( - ctx_path: str, op_name: str, soc_model: QcomChipset = QcomChipset.SM8650 +def from_context_binary( # noqa: C901 + ctx_path: str | bytes, + op_name: str, + soc_model: QcomChipset = QcomChipset.SM8650, + custom_info: Dict = None, ): + from pathlib import Path + def implement_op(custom_op, op_name, outputs): @torch.library.impl( custom_op, str(op_name), dispatch_key="CompositeExplicitAutograd" @@ -661,7 +670,7 @@ def forward(self, *inputs): return { "custom_op": custom_op, "custom_module": model, - "edge_program": prog, + "exported_program": prog, } def build_tensor(tensors, dtype_map): @@ -674,8 +683,12 @@ def build_tensor(tensors, dtype_map): return ret - with 
open(ctx_path, "rb") as f: - ctx_bin = f.read() + def preprocess_binary(ctx_bin, compiler_specs): + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + generate_qnn_executorch_option(compiler_specs), + ) + return bytes(qnn_mgr.MakeBinaryInfo(ctx_bin)) + # dummy compiler spec would be fine, since we're not compiling backend_options = generate_htp_compiler_spec(use_fp16=False) compiler_specs = generate_qnn_executorch_compiler_spec( @@ -683,26 +696,57 @@ def build_tensor(tensors, dtype_map): backend_options=backend_options, is_from_context_binary=True, ) - # get context-binary io tensor info through qnn manager - qnn_mgr = PyQnnManagerAdaptor.QnnManager( - generate_qnn_executorch_option(compiler_specs), ctx_bin + + ctx_bin = ( + ctx_path + if not isinstance(ctx_path, str) + else preprocess_binary(Path(f"{ctx_path}").read_bytes(), compiler_specs) ) - assert qnn_mgr.Init().value == 0, "failed to load context binary" - qnn_mgr.AllocateTensor() + dtype_map = {} for type_map in (QNN_QUANT_TYPE_MAP, QNN_TENSOR_TYPE_MAP): for k, v in type_map.items(): dtype_map.setdefault(v, k) - inputs = build_tensor(qnn_mgr.GetGraphInputs(), dtype_map) - outputs = build_tensor(qnn_mgr.GetGraphOutputs(), dtype_map) - qnn_mgr.Destroy() + + if custom_info is not None: + # since some context binaries might fail to open on host + # if they are compiled with special flags: + # e.g. weight sharing + # use custom information here instead + inputs = build_tensor(custom_info["graph_inputs"], dtype_map) + outputs = build_tensor(custom_info["graph_outputs"], dtype_map) + graph_name = custom_info["graph_name"] + else: + # get context-binary io tensor info through qnn manager + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + generate_qnn_executorch_option(compiler_specs), + ctx_bin, + ) + assert qnn_mgr.Init().value == 0, "failed to load context binary" + # assume we only have one graph in current context + graph_name = qnn_mgr.GetGraphNames()[0] + qnn_mgr.AllocateTensor(graph_name) + inputs = build_tensor(qnn_mgr.GetGraphInputs(graph_name), dtype_map) + outputs = build_tensor(qnn_mgr.GetGraphOutputs(graph_name), dtype_map) + qnn_mgr.Destroy() + # generate graph specific for loading context bundle_prog = build_graph(inputs, outputs) bundle_prog.update({"inputs": inputs, "outputs": outputs}) - for n in bundle_prog["edge_program"].graph_module.graph.nodes: - if op_name in n.name: + edge_prog_mgr = to_edge( + programs={graph_name: bundle_prog["exported_program"]}, + # do not alter name for custom op + compile_config=EdgeCompileConfig(_use_edge_ops=False), + ) + # update meta with context binary + for n in edge_prog_mgr._edge_programs[graph_name].graph.nodes: + if n.op == "call_function" and OpContextLoader.namespace in str(n.target): n.meta[OpContextLoader.meta_ctx_bin] = ctx_bin break + + bundle_prog["edge_program_manager"] = edge_prog_mgr.to_backend( + QnnPartitioner(compiler_specs) + ) return bundle_prog @@ -712,15 +756,59 @@ def draw_graph(title, path, graph_module: torch.fx.GraphModule): f.write(graph.get_dot_graph().create_svg()) -def generate_qnn_executorch_option( +def generate_multi_graph_program( compiler_specs: List[CompileSpec], -) -> bytes: - for compiler_spec in compiler_specs: - if compiler_spec.key == QCOM_QNN_COMPILE_SPEC: - qnn_compile_spec_buffer = compiler_spec.value - else: - raise ValueError(f"unknown compiler spec key value: {compiler_spec.key}") - return qnn_compile_spec_buffer + processed_bytes: List[bytes], + backend_config: ExecutorchBackendConfig = None, +) -> ExecutorchProgramManager: + # compile multiple graphs 
in qcir into single context binary + graph_inputs, graph_outputs = {}, {} + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + generate_qnn_executorch_option(compiler_specs), processed_bytes + ) + assert qnn_mgr.Init().value == 0, "failed to load processed bytes" + binary_info = bytes(qnn_mgr.Compile()) + assert len(binary_info) != 0, "failed to generate QNN context binary" + graph_names = qnn_mgr.GetGraphNames() + for graph_name in graph_names: + graph_inputs[graph_name] = qnn_mgr.GetGraphInputs(graph_name) + graph_outputs[graph_name] = qnn_mgr.GetGraphOutputs(graph_name) + qnn_mgr.Destroy() + + # build custom ops with different graph signatures + compiler_options = flatbuffer_to_option(compiler_specs[0].value) + bundle_progs = [ + from_context_binary( + ctx_path=binary_info, + op_name=f"loader_{graph_name}", + soc_model=compiler_options.soc_info.soc_model, + custom_info={ + "graph_inputs": graph_inputs[graph_name], + "graph_outputs": graph_outputs[graph_name], + "graph_name": graph_name, + }, + ) + for graph_name in graph_names + ] + # leverage ExecutorchProgramManager for generating pte with multi-methods + edge_prog_mgr = to_edge( + programs={ + graph_name: bundle_prog["exported_program"] + for graph_name, bundle_prog in zip(graph_names, bundle_progs) + }, + # do not alter name for custom op + compile_config=EdgeCompileConfig(_use_edge_ops=False), + ) + # restore meta losed in generating EdgeProgramManager + for graph_name in graph_names: + for n in edge_prog_mgr._edge_programs[graph_name].graph.nodes: + if graph_name in n.name: + n.meta[OpContextLoader.meta_ctx_bin] = binary_info + break + + return edge_prog_mgr.to_backend(QnnPartitioner(compiler_specs)).to_executorch( + config=backend_config or ExecutorchBackendConfig() + ) def generate_htp_compiler_spec( @@ -773,6 +861,8 @@ def generate_qnn_executorch_compiler_spec( optrace: bool = False, shared_buffer: bool = False, is_from_context_binary: bool = False, + multiple_graphs: bool = False, + graph_name: str = "forward", ) -> List[CompileSpec]: """ Helper function generating compiler specs for Qualcomm AI Engine Direct @@ -798,6 +888,10 @@ def generate_qnn_executorch_compiler_spec( profile the performance of each operator with cycle unit. shared_buffer: Enables usage of shared buffer between application and backend for graph I/O. + is_from_context_binary: True if current graph comes from pre-built context binary. + multiple_graphs: True if multiple methods are expected to have in single .pte file. + Please see test cases for post-processing example. + graph_name: Assign unique graph name if 'multiple_graphs' is used. Returns: List[CompileSpec]: Compiler specs for Qualcomm AI Engine Direct. 
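        As a hypothetical end-to-end sketch (modeled on the test_qnn_backend_multi_graphs
        test case in this change; the modules, input shapes, graph names, and soc_model
        below are placeholders):

            import torch
            from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
            from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
            from executorch.backends.qualcomm.utils.utils import (
                capture_program,
                generate_htp_compiler_spec,
                generate_multi_graph_program,
                generate_qnn_executorch_compiler_spec,
            )
            from executorch.exir.backend.backend_api import to_backend

            modules = [MyEncoder(), MyDecoder()]  # placeholder nn.Modules
            sample_inputs = [(torch.randn(1, 16),), (torch.randn(1, 16),)]
            graph_names = ["encoder", "decoder"]

            backend_options = generate_htp_compiler_spec(use_fp16=True)
            # one compile spec per graph, each carrying a unique graph_name
            compiler_specs = [
                generate_qnn_executorch_compiler_spec(
                    soc_model=QcomChipset.SM8650,
                    backend_options=backend_options,
                    multiple_graphs=True,
                    graph_name=name,
                )
                for name in graph_names
            ]
            edge_progs = [
                capture_program(m, inp) for m, inp in zip(modules, sample_inputs)
            ]
            exported_programs = [
                to_backend(prog.exported_program, QnnPartitioner(compiler_specs[i]))
                for i, prog in enumerate(edge_progs)
            ]
            # merge the lowered graphs into one context binary and emit a multi-method pte
            prog_mgr = generate_multi_graph_program(
                compiler_specs=compiler_specs[0],
                processed_bytes=[
                    prog.graph_module.lowered_module_0.processed_bytes
                    for prog in exported_programs
                ],
            )
            with open("multi_graph.pte", "wb") as f:
                f.write(prog_mgr.buffer)

        Each graph name becomes a separate method in the resulting .pte; the updated
        qnn_executor_runner in this change can select one at runtime through the new
        --method_index flag.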
@@ -820,7 +914,7 @@ def generate_qnn_executorch_compiler_spec( qnn_executorch_options = QnnExecuTorchOptions( _soc_info_table[soc_model], backend_options ) - qnn_executorch_options.graph_name = "executorch" + qnn_executorch_options.graph_name = graph_name qnn_executorch_options.log_level = ( QnnExecuTorchLogLevel.kLogLevelDebug if debug @@ -854,11 +948,15 @@ def generate_qnn_executorch_compiler_spec( qnn_executorch_options.shared_buffer = shared_buffer qnn_executorch_options.online_prepare = online_prepare qnn_executorch_options.is_from_context_binary = is_from_context_binary + qnn_executorch_options.multiple_graphs = multiple_graphs + + if multiple_graphs: + # enable weight sharing mechanism if multiple graphs appear + if backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend: + backend_options.htp_options.use_weight_sharing = True return [ - CompileSpec( - QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(qnn_executorch_options) - ) + CompileSpec(QCOM_QNN_COMPILE_SPEC, option_to_flatbuffer(qnn_executorch_options)) ] diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 7235e36681e..7a8af5181aa 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -50,6 +50,7 @@ DEFINE_bool( shared_buffer, false, "Specifies to use shared buffers for zero-copy usecase between the application and device/co-processor associated with the backend."); +DEFINE_uint32(method_index, 0, "Index of methods to be specified."); DEFINE_string( etdump_path, @@ -145,7 +146,9 @@ int main(int argc, char** argv) { const char* model_path = FLAGS_model_path.c_str(); Result loader = FileDataLoader::from(model_path); ET_CHECK_MSG( - loader.ok(), "FileDataLoader::from() failed: 0x%" PRIx32, loader.error()); + loader.ok(), + "FileDataLoader::from() failed: 0x%" PRIx32, + (int)loader.error()); // Parse the program file. This is immutable, and can also be reused between // multiple execution invocations across multiple threads. @@ -156,10 +159,11 @@ int main(int argc, char** argv) { } ET_LOG(Info, "Model file %s is loaded.", model_path); - // Use the first method in the program. + // Use the designated method in the program, default to the first one const char* method_name = nullptr; { - const auto method_name_result = program->get_method_name(0); + const auto method_name_result = + program->get_method_name(FLAGS_method_index); ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); method_name = *method_name_result; } @@ -233,7 +237,7 @@ int main(int argc, char** argv) { method.ok(), "Loading of method %s failed with status 0x%" PRIx32, method_name, - method.error()); + (int)method.error()); ET_LOG(Info, "Method loaded."); void* debug_buffer; @@ -272,7 +276,7 @@ int main(int argc, char** argv) { custom_mem_ptr->GetPtr(), const_cast(tensor_meta->dim_order().data())); Error ret = method->set_input(Tensor(&impl), input_index); - ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", ret); + ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", (int)ret); } for (int output_index = 0; output_index < method->outputs_size(); ++output_index) { @@ -292,7 +296,9 @@ int main(int argc, char** argv) { // This can error if the outputs are already pre-allocated. Ignore // this error because it doesn't affect correctness, but log it. 
ET_LOG( - Info, "ignoring error from set_output_data_ptr(): 0x%" PRIx32, ret); + Info, + "ignoring error from set_output_data_ptr(): 0x%" PRIx32, + (int)ret); } } ET_LOG(Info, "Inputs prepared."); @@ -364,7 +370,8 @@ int main(int argc, char** argv) { const_cast( tensor_meta->dim_order().data())); Error ret = method->set_input(Tensor(&impl), input_index); - ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", ret); + ET_CHECK_MSG( + ret == Error::Ok, "Failed to set input tensor: %d", (int)ret); } Error status = Error::Ok; @@ -398,7 +405,7 @@ int main(int argc, char** argv) { status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, method_name, - status); + (int)status); std::vector outputs(method->outputs_size()); status = method->get_outputs(outputs.data(), method->outputs_size()); @@ -443,7 +450,7 @@ int main(int argc, char** argv) { status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, method_name, - status); + (int)status); ET_LOG(Info, "Model executed successfully."); } diff --git a/examples/qualcomm/oss_scripts/llama2/llama.py b/examples/qualcomm/oss_scripts/llama2/llama.py index 875752728e6..b4a9a60c20a 100755 --- a/examples/qualcomm/oss_scripts/llama2/llama.py +++ b/examples/qualcomm/oss_scripts/llama2/llama.py @@ -17,9 +17,7 @@ from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO from executorch.backends.qualcomm.utils.utils import ( capture_program, diff --git a/examples/qualcomm/oss_scripts/llama3_2/llama.py b/examples/qualcomm/oss_scripts/llama3_2/llama.py index 20d674888da..d277c9b4e77 100755 --- a/examples/qualcomm/oss_scripts/llama3_2/llama.py +++ b/examples/qualcomm/oss_scripts/llama3_2/llama.py @@ -24,9 +24,7 @@ ) from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO from executorch.backends.qualcomm.utils.utils import ( capture_program, diff --git a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py index c54b75a6b6a..3b4d6c7cbff 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py +++ b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py @@ -8,16 +8,14 @@ import os from multiprocessing.connection import Client -import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor import torch -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( # noqa: F401 - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( + ExecutorchBackendConfig, from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, - generate_qnn_executorch_option, + get_soc_to_chipset_map, ) from executorch.examples.qualcomm.qaihub_scripts.utils.utils import ( gen_pte_from_ctx_bin, @@ -27,6 +25,7 @@ setup_common_args_and_variables, SimpleADB, ) +from 
executorch.exir.passes.memory_planning_pass import MemoryPlanningPass def main(args): @@ -66,13 +65,27 @@ def main(args): if args.pre_gen_pte is None: # create custom operators as context loader + soc_model = get_soc_to_chipset_map()[args.model] bundle_programs = [ - from_context_binary(f"{args.context_binaries}/{target}", f"ctx_loader_{i}") + from_context_binary( + ctx_path=f"{args.context_binaries}/{target}", + op_name=f"ctx_loader_{i}", + soc_model=soc_model, + ) for i, target in enumerate(target_names) ] pte_names = [f"{pte_name}_{i}" for i in range(len(target_names))] + memory_planning_pass = MemoryPlanningPass( + alloc_graph_input=False, + alloc_graph_output=False, + ) pte_files = gen_pte_from_ctx_bin( - args.artifact, pte_names, compiler_specs, bundle_programs + artifact=args.artifact, + pte_names=pte_names, + bundle_programs=bundle_programs, + backend_config=ExecutorchBackendConfig( + memory_planning_pass=memory_planning_pass + ), ) else: pte_files = [f"{args.pre_gen_pte}/{pte_name}_{i}.pte" for i in range(4)] @@ -80,19 +93,6 @@ def main(args): if args.compile_only: return - def get_logit_encoding(path_to_last_shard: str): - with open(f"{args.context_binaries}/{path_to_last_shard}", "rb") as f: - ctx_bin = f.read() - qnn_mgr = PyQnnManagerAdaptor.QnnManager( - generate_qnn_executorch_option(compiler_specs), ctx_bin - ) - assert qnn_mgr.Init().value == 0, "failed to load context binary" - qnn_mgr.AllocateTensor() - logits = qnn_mgr.GetGraphOutputs()[-1] - encoding = logits.GetEncodings() - qnn_mgr.Destroy() - return encoding.data["scale"].item(), encoding.data["offset"].item() - adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=args.build_folder, diff --git a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py index 9acbeebef2d..7607c476051 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py +++ b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py @@ -9,14 +9,14 @@ from multiprocessing.connection import Client import torch -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( # noqa: F401 - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( + ExecutorchBackendConfig, from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + get_soc_to_chipset_map, ) from executorch.examples.qualcomm.qaihub_scripts.utils.utils import ( gen_pte_from_ctx_bin, @@ -26,6 +26,7 @@ setup_common_args_and_variables, SimpleADB, ) +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass def main(args): @@ -58,22 +59,34 @@ def main(args): pte_name = "qaihub_llama3_8b_prompt" last_shard_num_inputs = 4 last_shard_num_outputs = 65 - custom_spill_fill = 128974848 else: pte_name = "qaihub_llama3_8b_token" last_shard_num_inputs = 68 last_shard_num_outputs = 65 - custom_spill_fill = 3932160 if args.pre_gen_pte is None: # create custom operators as context loader + soc_model = get_soc_to_chipset_map()[args.model] bundle_programs = [ - from_context_binary(f"{args.context_binaries}/{target}", f"ctx_loader_{i}") + from_context_binary( + ctx_path=f"{args.context_binaries}/{target}", + op_name=f"ctx_loader_{i}", + soc_model=soc_model, + ) for i, target in enumerate(target_names) ] pte_names = [f"{pte_name}_{i}" for i in range(len(target_names))] + memory_planning_pass = MemoryPlanningPass( + alloc_graph_input=False, 
+ alloc_graph_output=False, + ) pte_files = gen_pte_from_ctx_bin( - args.artifact, pte_names, compiler_specs, bundle_programs, custom_spill_fill + artifact=args.artifact, + pte_names=pte_names, + bundle_programs=bundle_programs, + backend_config=ExecutorchBackendConfig( + memory_planning_pass=memory_planning_pass + ), ) else: pte_files = [f"{args.pre_gen_pte}/{pte_name}_{i}.pte" for i in range(5)] diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp index 9dc1ee7e254..9ee7551650a 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp @@ -29,7 +29,11 @@ Memory::Memory( input_tensors_(modules.size()), output_tensors_(modules.size()), pos_embs_path_(pos_embs_path), - modules_(modules) {} + modules_(modules) { + for (std::shared_ptr& module : modules_) { + method_names_.emplace_back(*module->method_names()->begin()); + } +} Memory::~Memory() {} @@ -436,7 +440,9 @@ void KVCachedMemory::update_io( int index = (cache_stride << 1) + (cache_group << 5) + head; ET_CHECK_MSG( modules_[shard]->set_output( - output_tensors[shard][index], index) == Error::Ok, + method_names_[shard], + output_tensors[shard][index], + index) == Error::Ok, "failed to set output tensor for module %d's %d'th output " "while updating kv_cache output tensors", shard, @@ -458,7 +464,8 @@ void KVCachedMemory::update_io( for (int shard = 0; shard < output_tensors.size(); shard++) { for (int index = 0; index < output_tensors[shard].size(); index++) { ET_CHECK_MSG( - modules_[shard]->set_output(output_tensors[shard][index], index) == + modules_[shard]->set_output( + method_names_[shard], output_tensors[shard][index], index) == Error::Ok, "failed to set output tensor for module %d's %d'th output " "while updating kv_cache output tensors", diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h index 4ad7264cc91..445be2ed21a 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h +++ b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h @@ -52,6 +52,7 @@ class Memory { std::vector> output_tensors_; std::vector pos_embs_path_; std::vector> modules_; + std::vector method_names_; }; class BertMemory : public Memory { diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp index 959f6810ae5..4bddb32b53e 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp @@ -115,7 +115,8 @@ Error Runner::load() { return Error::Ok; } for (std::shared_ptr& module : modules_) { - ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("forward")); + method_names_.emplace_back(*module->method_names()->begin()); + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(method_names_.back())); } // create sampler @@ -160,7 +161,8 @@ int32_t Runner::logitsToToken(const Tensor& logits_tensor) { void Runner::run_model_step(std::vector>& inputs) { for (size_t i = 0, num_modules = modules_.size(); i < num_modules; ++i) { - Result> outputs_res = modules_[i]->forward(inputs[i]); + Result> outputs_res = + modules_[i]->execute(method_names_[i], inputs[i]); ET_CHECK_MSG( outputs_res.error() == Error::Ok, "shard %zu inference failed", i); } @@ -185,7 +187,8 @@ Error Runner::generate( output_tensors.emplace_back(io_mem_->get_output_tensors(i)); for (size_t j = 0; j < 
output_tensors[i].size(); ++j) { ET_CHECK_MSG( - modules_[i]->set_output(output_tensors[i][j], j) == Error::Ok, + modules_[i]->set_output( + method_names_[i], output_tensors[i][j], j) == Error::Ok, "failed to set output tensor for module %d's %zu'th output", i, j); @@ -407,8 +410,8 @@ std::string statsToJsonString(const Runner::Stats& stats) { std::vector> Runner::get_methods_meta() { std::vector> methods_meta; methods_meta.reserve(modules_.size()); - for (std::shared_ptr& module : modules_) { - methods_meta.emplace_back(module->method_meta("forward")); + for (size_t i = 0; i < modules_.size(); ++i) { + methods_meta.emplace_back(modules_[i]->method_meta(method_names_[i])); } return methods_meta; } diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h index 0d15114bc64..be9af7e2275 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h @@ -98,6 +98,7 @@ class Runner { const int32_t max_seq_len_; int32_t eval_mode_; std::vector> modules_; + std::vector method_names_; std::string tokenizer_path_; float temperature_; std::unique_ptr tokenizer_; diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py index defce876ba0..8e56ce11e2e 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py @@ -13,14 +13,14 @@ import torch from diffusers import EulerDiscreteScheduler, UNet2DConditionModel from diffusers.models.embeddings import get_timestep_embedding -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) from executorch.backends.qualcomm.utils.utils import ( + ExecutorchBackendConfig, from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + get_soc_to_chipset_map, + QcomChipset, ) from executorch.examples.qualcomm.qaihub_scripts.stable_diffusion.stable_diffusion_lib import ( @@ -34,6 +34,7 @@ setup_common_args_and_variables, SimpleADB, ) +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from PIL import Image from torchvision.transforms import ToTensor @@ -353,7 +354,6 @@ def post_process_vae(): def main(args): os.makedirs(args.artifact, exist_ok=True) - # common part for compile & inference backend_options = generate_htp_compiler_spec( use_fp16=False, @@ -367,14 +367,24 @@ def main(args): if args.pre_gen_pte is None: # Create custom operators as context loader + soc_model = get_soc_to_chipset_map()[args.model] bundle_programs = [ - from_context_binary(args.text_encoder_bin, "ctx_loader_0"), - from_context_binary(args.unet_bin, "ctx_loader_1"), - from_context_binary(args.vae_bin, "ctx_loader_2"), + from_context_binary(args.text_encoder_bin, "ctx_loader_0", soc_model), + from_context_binary(args.unet_bin, "ctx_loader_1", soc_model), + from_context_binary(args.vae_bin, "ctx_loader_2", soc_model), ] pte_names = [f"{args.pte_prefix}_{target_name}" for target_name in target_names] + memory_planning_pass = MemoryPlanningPass( + alloc_graph_input=False, + alloc_graph_output=False, + ) pte_files = gen_pte_from_ctx_bin( - args.artifact, pte_names, compiler_specs, bundle_programs + artifact=args.artifact, + pte_names=pte_names, + bundle_programs=bundle_programs, + backend_config=ExecutorchBackendConfig( + 
memory_planning_pass=memory_planning_pass + ), ) assert ( len(pte_files) == 3 diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp index cc54a801737..585d58b21ee 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp @@ -75,8 +75,8 @@ Runner::Runner( std::vector> Runner::get_methods_meta() { std::vector> methods_meta; - for (std::unique_ptr& module : modules_) { - methods_meta.emplace_back(module->method_meta("forward")); + for (size_t i = 0; i < modules_.size(); ++i) { + methods_meta.emplace_back(modules_[i]->method_meta(method_names_[i])); } return methods_meta; } @@ -95,7 +95,8 @@ Error Runner::load() { } stats_.model_load_start_ms = time_in_ms(); for (auto& module : modules_) { - ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("forward")); + method_names_.emplace_back(*module->method_names()->begin()); + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(method_names_.back())); } stats_.model_load_end_ms = time_in_ms(); return Error::Ok; @@ -378,13 +379,14 @@ Error Runner::generate(std::string prompt) { uncond_emb_vec.data(), {1, 77, 1024}, encoder_method_meta.output_tensor_meta(0)->scalar_type()); - modules_[0]->set_output(cond_emb_tensor); + auto ret = modules_[0]->set_output(method_names_[0], cond_emb_tensor); long encoder_start = time_in_ms(); - auto cond_res = modules_[0]->forward(cond_tokens_tensor); + auto cond_res = modules_[0]->execute(method_names_[0], cond_tokens_tensor); stats_.text_encoder_execution_time += (time_in_ms() - encoder_start); - modules_[0]->set_output(uncond_emb_tensor); + ret = modules_[0]->set_output(method_names_[0], uncond_emb_tensor); encoder_start = time_in_ms(); - auto uncond_res = modules_[0]->forward(uncond_tokens_tensor); + auto uncond_res = + modules_[0]->execute(method_names_[0], uncond_tokens_tensor); stats_.text_encoder_execution_time += (time_in_ms() - encoder_start); // Initialize unet parameters @@ -467,15 +469,17 @@ Error Runner::generate(std::string prompt) { stats_.unet_aggregate_post_processing_time += (time_in_ms() - start_post_process); - modules_[1]->set_output(noise_pred_text_tensor); + ret = modules_[1]->set_output(method_names_[1], noise_pred_text_tensor); long start_unet_execution = time_in_ms(); - auto cond_res = modules_[1]->forward( + auto cond_res = modules_[1]->execute( + method_names_[1], {latent_tensor, time_emb_tensors[step_index], cond_emb_tensor}); stats_.unet_aggregate_execution_time += (time_in_ms() - start_unet_execution); - modules_[1]->set_output(noise_pred_uncond_tensor); + ret = modules_[1]->set_output(method_names_[1], noise_pred_uncond_tensor); start_unet_execution = time_in_ms(); - auto uncond_res = modules_[1]->forward( + auto uncond_res = modules_[1]->execute( + method_names_[1], {latent_tensor, time_emb_tensors[step_index], uncond_emb_tensor}); // results in noise_pred_uncond_vec @@ -524,9 +528,9 @@ Error Runner::generate(std::string prompt) { quant_tensor(latent, vae_input, vae_input_scale_, vae_input_offset_); - modules_[2]->set_output(output_tensor); + ret = modules_[2]->set_output(method_names_[2], output_tensor); long start_vae_execution = time_in_ms(); - auto vae_res = modules_[2]->forward(vae_input_tensor); + auto vae_res = modules_[2]->execute(method_names_[2], vae_input_tensor); stats_.vae_execution_time = (time_in_ms() - start_vae_execution); stats_.generate_end_ms = time_in_ms(); diff --git 
a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h index f91efd5b832..e49201bca25 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h @@ -112,6 +112,7 @@ class Runner { private: Stats stats_; std::vector> modules_; + std::vector method_names_; std::vector> time_emb_list_; std::unordered_map vocab_to_token_map_; diff --git a/examples/qualcomm/qaihub_scripts/utils/export.py b/examples/qualcomm/qaihub_scripts/utils/export.py index b742f59f1d4..4d252175dbb 100644 --- a/examples/qualcomm/qaihub_scripts/utils/export.py +++ b/examples/qualcomm/qaihub_scripts/utils/export.py @@ -15,18 +15,17 @@ import numpy as np import torch -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( draw_graph, + ExecutorchBackendConfig, from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, generate_qnn_executorch_option, ) +from executorch.examples.qualcomm.qaihub_scripts.utils.utils import preprocess_binary from executorch.examples.qualcomm.utils import make_output_dir, SimpleADB -from executorch.exir.backend.backend_api import to_backend from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass @@ -45,7 +44,7 @@ def get_logger(): return logging.LoggerAdapter(logger, extra={"prefix": "UTILS.EXPORT"}) -def get_io_info(prog_info, ctx_bin_path, compiler_spec): +def get_io_info(prog_info, ctx_bin_path, compiler_specs): def fill_tensor_info(info, qnn_tensors, category): # fetch related IO info stored in prog_info for i, (name, tensor) in enumerate(prog_info[category].items()): @@ -70,15 +69,16 @@ def fill_tensor_info(info, qnn_tensors, category): tensor_info = {in_key: [], out_key: []} with open(ctx_bin_path, "rb") as f: - ctx_bin = f.read() + ctx_bin = preprocess_binary(f.read(), compiler_specs) # leverage QNN pybind interface to retrieve tensor encodings qnn_mgr = PyQnnManagerAdaptor.QnnManager( - generate_qnn_executorch_option(compiler_spec), ctx_bin + generate_qnn_executorch_option(compiler_specs), ctx_bin ) assert qnn_mgr.Init().value == 0, "failed to load context binary" - qnn_mgr.AllocateTensor() - fill_tensor_info(tensor_info, qnn_mgr.GetGraphInputs(), in_key) - fill_tensor_info(tensor_info, qnn_mgr.GetGraphOutputs(), out_key) + graph_name = qnn_mgr.GetGraphNames()[0] + qnn_mgr.AllocateTensor(graph_name) + fill_tensor_info(tensor_info, qnn_mgr.GetGraphInputs(graph_name), in_key) + fill_tensor_info(tensor_info, qnn_mgr.GetGraphOutputs(graph_name), out_key) qnn_mgr.Destroy() return tensor_info @@ -250,28 +250,24 @@ def compile(args): postfix += 1 custom_op_name = f"{custom_op_name}_{postfix}" name_map[custom_op_name] = postfix - # step 1: generate ExportedProgram with custom op as binary loader + # step 1: generate ExportedProgram with custom op as binary loader & lower to QnnBackend logger.info(f"({index}/{num_bins}) exporting program for {ctx_bin}") prog_info = from_context_binary( ctx_bin, custom_op_name, getattr(QcomChipset, args.model) ) - # step 2: lower to QnnBackend - logger.info(f"({index}/{num_bins}) start lowering {ctx_bin} to QnnBackend") - lowered_module = to_backend( - "QnnBackend", prog_info["edge_program"], compiler_specs - ) - # step 3: write pte files and IO information + # step 2: write pte 
files and IO information logger.info(f"({index}/{num_bins}) exporting {binary_name}.pte") with open(f"{output_dir}/{binary_name}.pte", "wb") as f: - f.write( - lowered_module.buffer( - extract_delegate_segments=True, memory_planning=memory_planning_pass + prog_info["edge_program_manager"].to_executorch( + config=ExecutorchBackendConfig( + memory_planning_pass=memory_planning_pass ) - ) + ).write_to_file(f) + logger.info( f"({index}/{num_bins}) exporting network graph with {binary_name}.svg" ) - draw_graph(binary_name, output_dir, prog_info["edge_program"].graph_module) + draw_graph(binary_name, output_dir, prog_info["exported_program"].graph_module) logger.info( f"({index}/{num_bins}) exporting graph description with {binary_name}.json" ) diff --git a/examples/qualcomm/qaihub_scripts/utils/utils.py b/examples/qualcomm/qaihub_scripts/utils/utils.py index ad55d7fd10b..fc065b79af5 100644 --- a/examples/qualcomm/qaihub_scripts/utils/utils.py +++ b/examples/qualcomm/qaihub_scripts/utils/utils.py @@ -9,11 +9,16 @@ import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor from executorch.backends.qualcomm.utils.utils import ( - canonicalize_program, generate_qnn_executorch_option, + update_spill_fill_size, ) -from executorch.exir.backend.backend_api import to_backend -from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass + + +def preprocess_binary(ctx_bin, compiler_specs): + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + generate_qnn_executorch_option(compiler_specs), + ) + return bytes(qnn_mgr.MakeBinaryInfo(ctx_bin)) def get_encoding( @@ -26,16 +31,17 @@ def get_encoding( ): encoding_list = [] with open(path_to_shard, "rb") as f: - ctx_bin = f.read() + ctx_bin = preprocess_binary(f.read(), compiler_specs) qnn_mgr = PyQnnManagerAdaptor.QnnManager( generate_qnn_executorch_option(compiler_specs), ctx_bin ) assert qnn_mgr.Init().value == 0, "failed to load context binary" - qnn_mgr.AllocateTensor() + graph_name = qnn_mgr.GetGraphNames()[0] + qnn_mgr.AllocateTensor(graph_name) if get_input: encoding_input = {"scale": [], "offset": []} for i in range(num_input): - inputs = qnn_mgr.GetGraphInputs()[i] + inputs = qnn_mgr.GetGraphInputs(graph_name)[i] encoding = inputs.GetEncodings() encoding_input["scale"].append(encoding.data["scale"].item()) encoding_input["offset"].append(encoding.data["offset"].item()) @@ -43,7 +49,7 @@ def get_encoding( if get_output: encoding_output = {"scale": [], "offset": []} for i in range(num_output): - outputs = qnn_mgr.GetGraphOutputs()[i] + outputs = qnn_mgr.GetGraphOutputs(graph_name)[i] encoding = outputs.GetEncodings() encoding_output["scale"].append(encoding.data["scale"].item()) encoding_output["offset"].append(encoding.data["offset"].item()) @@ -52,35 +58,25 @@ def get_encoding( return encoding_list -def gen_pte_from_ctx_bin( - artifact, pte_names, compiler_specs, bundle_programs, custom_spill_fill=None -): - - # Lower with QnnBackend - lowered_modules = [ - to_backend("QnnBackend", prog["edge_program"], compiler_specs) - for prog in bundle_programs - ] +def gen_pte_from_ctx_bin(artifact, pte_names, bundle_programs, backend_config): + edge_prog_mgrs = [prog["edge_program_manager"] for prog in bundle_programs] # Setup spill-fill buffer for relieving runtime memory usage - canonicalize_program(lowered_modules, custom_buffer_size=custom_spill_fill) - # export pte files + update_spill_fill_size( + [ + prog_mgr._edge_programs[list(prog_mgr.methods)[0]] + for prog_mgr in edge_prog_mgrs + ] + ) + # Export pte files pte_files = 
[] for pte_name in pte_names: print(f"{pte_name} generating...") - memory_planning_pass = MemoryPlanningPass( - alloc_graph_input=False, - alloc_graph_output=False, - ) pte_files.append(f"{artifact}/{pte_name}.pte") - with open(pte_files[-1], "wb") as file: - file.write( - lowered_modules[0].buffer( - extract_delegate_segments=True, memory_planning=memory_planning_pass - ) - ) + with open(pte_files[-1], "wb") as f: + edge_prog_mgrs[0].to_executorch(config=backend_config).write_to_file(f) # GC for reducing host memory consuming bundle_programs.pop(0) - lowered_modules.pop(0) + edge_prog_mgrs.pop(0) gc.collect() return pte_files diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py index 56169e39a2e..7445ba4a5ec 100644 --- a/examples/qualcomm/scripts/export_example.py +++ b/examples/qualcomm/scripts/export_example.py @@ -5,9 +5,7 @@ import torch from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( capture_program, generate_htp_compiler_spec, diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 573e23640b2..8051d157166 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -12,9 +12,7 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 52c263f0a4b..dc517764f8d 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -17,9 +17,7 @@ import torch from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( capture_program, generate_htp_compiler_spec, @@ -138,7 +136,7 @@ def push(self, inputs=None, input_list=None, files=None): for file_name in files: self._adb(["push", file_name, self.workspace]) - def execute(self, custom_runner_cmd=None): + def execute(self, custom_runner_cmd=None, method_index=0): self._adb(["shell", f"mkdir -p {self.output_folder}"]) # run the delegation if custom_runner_cmd is None: @@ -155,6 +153,7 @@ def execute(self, custom_runner_cmd=None): if self.dump_intermediate_outputs else "" ), + f"--method_index {method_index}", ] ) qnn_executor_runner_cmds = " ".join( From 8437526e779c99f2788cd4611d405735e9b36ea7 Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Sun, 17 Nov 2024 23:28:03 +0800 Subject: [PATCH 2/3] fix import error in partitioner_lib.py --- extension/llm/export/partitioner_lib.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/extension/llm/export/partitioner_lib.py 
b/extension/llm/export/partitioner_lib.py index 6f4b95e3d08..9920da9574c 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -160,10 +160,8 @@ def get_qnn_partitioner( QnnPartitioner, ) - # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.serialization.qnn_compile_spec_schema` - from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, - ) + # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.serialization.qc_schema` + from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` from executorch.backends.qualcomm.utils.utils import ( From 9032c34413ec7a0fd1d40c73739d44afdadd4b9f Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Mon, 18 Nov 2024 13:52:32 +0800 Subject: [PATCH 3/3] apply bzl changes --- backends/qualcomm/aot/python/targets.bzl | 1 + backends/qualcomm/runtime/targets.bzl | 2 ++ backends/qualcomm/targets.bzl | 33 ++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/backends/qualcomm/aot/python/targets.bzl b/backends/qualcomm/aot/python/targets.bzl index e1f5a6a8fc5..8eb8d095c30 100644 --- a/backends/qualcomm/aot/python/targets.bzl +++ b/backends/qualcomm/aot/python/targets.bzl @@ -31,6 +31,7 @@ def define_common_targets(): "//executorch/backends/qualcomm/aot/wrappers:wrappers", "//executorch/backends/qualcomm/runtime:logging", "//executorch/backends/qualcomm:schema", + "//executorch/backends/qualcomm:qc_binary_info_schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl index f7a3e220dee..73d333f52dd 100644 --- a/backends/qualcomm/runtime/targets.bzl +++ b/backends/qualcomm/runtime/targets.bzl @@ -29,6 +29,7 @@ def define_common_targets(): ], exported_deps = [ "//executorch/backends/qualcomm:schema", + "//executorch/backends/qualcomm:qc_binary_info_schema", "//executorch/runtime/core:core", ], ) @@ -63,6 +64,7 @@ def define_common_targets(): "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ":logging", "//executorch/backends/qualcomm:schema", + "//executorch/backends/qualcomm:qc_binary_info_schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/aot/wrappers:wrappers", "//executorch/runtime/backend:interface", diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl index 08d163eefc3..14e02989e5c 100644 --- a/backends/qualcomm/targets.bzl +++ b/backends/qualcomm/targets.bzl @@ -16,6 +16,12 @@ SCHEMA_GEN_RULE_NAME = "qc_compiler_spec_generated" SCHEMA_LIRRARY_NAME = SCHEMA_NAME +QC_BINARY_INFO_SCHEMA = "qc_binary_info" +QC_BINARY_INFO_INPUT_SCHEMA = "serialization/" + QC_BINARY_INFO_SCHEMA + ".fbs" +QC_BINARY_INFO_SCHEMA_GEN_RULE_NAME = QC_BINARY_INFO_SCHEMA + "_generated" +QC_BINARY_INFO_OUTPUT_SCHEMA_HEADER = QC_BINARY_INFO_SCHEMA_GEN_RULE_NAME + ".h" +QC_BINARY_INFO_SCHEMA_LIRRARY_NAME = QC_BINARY_INFO_SCHEMA + def generate_schema_header(rule_name, srcs, headers, default_header): """Generate header file given flatbuffer schema """ @@ -77,6 +83,33 @@ def define_common_targets(): platforms 
= [ANDROID], ) + generate_schema_header( + QC_BINARY_INFO_SCHEMA_GEN_RULE_NAME, + [QC_BINARY_INFO_INPUT_SCHEMA], + [QC_BINARY_INFO_OUTPUT_SCHEMA_HEADER], + QC_BINARY_INFO_OUTPUT_SCHEMA_HEADER, + ) + + runtime.cxx_library( + name = "qc_binary_info_schema", + srcs = [], + visibility = [ + # Lock this down as tightly as possible to ensure that flatbuffers + # are an implementation detail. Ideally this list would only include + # //executorch/runtime/executor/... + "//executorch/codegen/tools/...", + "//executorch/runtime/executor/...", + "//executorch/backends/qualcomm/...", + "//executorch/backends/qualcomm/runtime/...", + ], + exported_headers = { + QC_BINARY_INFO_OUTPUT_SCHEMA_HEADER: ":{}[{}]".format( QC_BINARY_INFO_SCHEMA_GEN_RULE_NAME, QC_BINARY_INFO_OUTPUT_SCHEMA_HEADER), + }, + exported_external_deps = ["flatbuffers-api"], + define_static_target = True, + platforms = [ANDROID], + ) + runtime.cxx_library( name = "qnn_executorch_backend", srcs = [],