Dev weight sharing #6657

Merged
merged 3 commits on Nov 18, 2024

5 changes: 4 additions & 1 deletion backends/qualcomm/CMakeLists.txt
@@ -74,7 +74,10 @@ include_directories(
${EXECUTORCH_SOURCE_DIR}/third-party/flatbuffers/include
)

set(_qnn_schema__srcs backends/qualcomm/serialization/schema.fbs)
set(_qnn_schema__srcs
backends/qualcomm/serialization/qc_compiler_spec.fbs
backends/qualcomm/serialization/qc_binary_info.fbs
)
set(_qnn_schema__include_dir "${CMAKE_BINARY_DIR}/schema/include")
# Paths to headers generated from the .fbs files.
set(_qnn_schema__outputs)
1 change: 1 addition & 0 deletions backends/qualcomm/_passes/remove_redundancy.py
@@ -20,6 +20,7 @@ class RemoveRedundancy(ExportPass):
exir_ops.edge.aten.clone.default,
torch.ops.aten.alias.default,
exir_ops.edge.aten.alias.default,
exir_ops.edge.aten.lift_fresh_copy.default,
}

def __init__(self):
7 changes: 6 additions & 1 deletion backends/qualcomm/aot/ir/qcir.fbs
@@ -94,8 +94,13 @@ table Operator {
}

table Graph {
name: string;
nodes: [Operator];
tensors: [Tensor];
}

root_type Graph;
table Context {
graphs: [Graph];
}

root_type Context;
8 changes: 2 additions & 6 deletions backends/qualcomm/aot/ir/qcir_utils.cpp
@@ -161,9 +161,7 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
}
} break;
default:
QNN_EXECUTORCH_LOG_WARN(
"QNN_QUANTIZATION_ENCODING_UNDEFINED detected: %s",
QNN_VER_PTR(tensor)->name);
// encodings are not required if lowering with floating point precision
break;
}
return CreateQuantizeParamDirect(
@@ -229,9 +227,7 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) {
const_cast<int32_t*>(param->offsets()->data());
} break;
default:
QNN_EXECUTORCH_LOG_WARN(
"qcir::QuantizeType::UNDEFINED detected: %s",
tensor->name()->c_str());
// encodings are not required if lowering with floating point precision
break;
}
return p;
14 changes: 12 additions & 2 deletions backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
@@ -30,16 +30,26 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {
py::class_<PyQnnManager, std::shared_ptr<PyQnnManager>>(m, "QnnManager")
.def(py::init<const py::bytes&>())
.def(py::init<const py::bytes&, const py::bytes&>())
.def(py::init<const py::bytes&, const py::list&>())
.def("Init", &PyQnnManager::Init)
.def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend)
.def("Compile", &PyQnnManager::Compile)
.def("Compile", py::overload_cast<>(&PyQnnManager::Compile))
.def(
"Compile",
py::overload_cast<
const std::string&,
std::vector<std::shared_ptr<OpWrapper>>&>(&PyQnnManager::Compile))
.def("Destroy", &PyQnnManager::Destroy)
.def("IsAvailable", &PyQnnManager::IsAvailable)
.def("IsTensorDump", &PyQnnManager::IsTensorDump)
.def("AllocateTensor", &PyQnnManager::AllocateTensor)
.def("GetGraphInputs", &PyQnnManager::GetGraphInputs)
.def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs)
.def("GetSpillFillBufferSize", &PyQnnManager::GetSpillFillBufferSize);
.def("GetGraphNames", &PyQnnManager::GetGraphNames)
.def("GetSpillFillBufferSize", &PyQnnManager::GetSpillFillBufferSize)
.def(
"MakeBinaryInfo",
py::overload_cast<const py::bytes&>(&PyQnnManager::MakeBinaryInfo));
}
} // namespace qnn
} // namespace backends
196 changes: 170 additions & 26 deletions backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
@@ -8,10 +8,11 @@
#pragma once
#include <executorch/backends/qualcomm/aot/ir/qcir_utils.h>
#include <executorch/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h>
#include <executorch/backends/qualcomm/qc_binary_info_generated.h>
#include <executorch/backends/qualcomm/qc_compiler_spec_generated.h>
#include <executorch/backends/qualcomm/runtime/Logging.h>
#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/backends/qualcomm/runtime/QnnManager.h>
#include <executorch/backends/qualcomm/schema_generated.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
@@ -35,32 +36,127 @@ class PyQnnManager {
qnn_manager_ = std::make_shared<QnnManager>(
qnn_executorch_options, qnn_executorch_context_binary_);
}

// used for loading context binary directly
explicit PyQnnManager(const py::bytes& buffer, const py::bytes& ctx_bin)
: qnn_executorch_option_ptr_(buffer) {
auto qnn_executorch_options = GetQnnExecuTorchOptions(
qnn_executorch_option_ptr_.cast<std::string_view>().data());

py::buffer_info info(py::buffer(ctx_bin).request());
qnn_executorch_context_binary_.buffer = static_cast<void*>(info.ptr);
qnn_executorch_context_binary_.buffer = info.ptr;
qnn_executorch_context_binary_.nbytes = info.size * info.itemsize;
qnn_manager_ = std::make_shared<QnnManager>(
qnn_executorch_options, qnn_executorch_context_binary_);
}

// used for loading multiple graphs in qcir
explicit PyQnnManager(const py::bytes& buffer, const py::list& qcirs)
: qnn_executorch_option_ptr_(buffer) {
auto qnn_executorch_options = GetQnnExecuTorchOptions(
qnn_executorch_option_ptr_.cast<std::string_view>().data());

// merge multiple qcirs into one context with multiple graphs
std::vector<flatbuffers::Offset<qcir::Graph>> graphs;
for (size_t i = 0; i < qcirs.size(); ++i) {
py::buffer_info info(py::buffer(qcirs[i].cast<py::bytes>()).request());
flatbuffers::Verifier verifier_binary_info(
static_cast<const uint8_t* const>(info.ptr),
info.size * info.itemsize);
if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) {
QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info");
return;
}
auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr);

flatbuffers::Verifier verifier_qcir(
binary_info->data()->data(), binary_info->data()->size());
if (!qcir::VerifyContextBuffer(verifier_qcir)) {
QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format");
return;
}
auto context = qcir::GetContext(binary_info->data()->data());
for (const auto& graph : *context->graphs()) {
std::vector<flatbuffers::Offset<qcir::Tensor>> tensors;
for (const auto tensor : *graph->tensors()) {
// here we need to take a detour to merge multiple qcir flatbuffers
// outer ToTensor
// return: flatbuffers::Offset<Tensor>
// consume: QnnTensor, flatbuffers::FlatBufferBuilder*
// inner ToTensor
// return: QnnTensor
// consume: flatbuffers::Vector<::flatbuffers::Offset<qcir::Tensor>>
tensors.emplace_back(ToTensor(ToTensor(tensor), &builder_));
Contributor: What does `ToTensor(ToTensor(tensor), &builder_)` mean?

Contributor: I'm guessing here we'll deduplicate tensors?

Collaborator (Author): The inner ToTensor converts a serialized tensor in qcir into a QnnTensor as defined in the QNN SDK header. The outer ToTensor converts that QnnTensor back into a flatbuffer-API-compatible tensor for building qcir.

FlatBuffers seems to have no mechanism for merging binaries, so this is the detour I've come up with so far. I'll rephrase the comment for better understanding.
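A minimal sketch of the round-trip described in this thread; the declarations below are simplified from qcir_utils.h, and the exact parameter types are assumptions rather than the repo's actual signatures.

```cpp
// inner ToTensor: deserialize a serialized qcir tensor into the QNN SDK struct
Qnn_Tensor_t ToTensor(const qcir::Tensor* tensor);

// outer ToTensor: serialize a QNN SDK struct into the target (merge) builder
flatbuffers::Offset<qcir::Tensor> ToTensor(
    const Qnn_Tensor_t& tensor, flatbuffers::FlatBufferBuilder* builder);

// The merge loop therefore round-trips each tensor through Qnn_Tensor_t so
// that builder_, which backs the merged qcir::Context, owns a fresh copy:
//   tensors.emplace_back(ToTensor(ToTensor(tensor), &builder_));
```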

}
std::vector<flatbuffers::Offset<qcir::Operator>> nodes;
for (const auto& node : *graph->nodes()) {
int32_t* inputs_ptr = const_cast<int32_t*>(node->inputs()->data());
int32_t* outputs_ptr = const_cast<int32_t*>(node->outputs()->data());
int32_t* params_ptr = const_cast<int32_t*>(node->params()->data());
std::vector<int32_t> inputs(
inputs_ptr, inputs_ptr + node->inputs()->size());
std::vector<int32_t> outputs(
outputs_ptr, outputs_ptr + node->outputs()->size());
std::vector<int32_t> params(
params_ptr, params_ptr + node->params()->size());
nodes.emplace_back(qcir::CreateOperatorDirect(
builder_,
node->name()->str().c_str(),
node->package_name()->str().c_str(),
node->type_name()->str().c_str(),
&inputs,
&outputs,
&params));
}
graphs.emplace_back(qcir::CreateGraphDirect(
builder_, graph->name()->str().c_str(), &nodes, &tensors));
}
}

auto context = qcir::CreateContextDirect(builder_, &graphs);
builder_.Finish(context);
QnnExecuTorchContextBinary qcir_bin(
{builder_.GetBufferPointer(), builder_.GetSize()});

qnn_executorch_context_binary_ = MakeBinaryInfo(qcir_bin);
qnn_manager_ = std::make_shared<QnnManager>(
qnn_executorch_options, qnn_executorch_context_binary_);
}

executorch::runtime::Error Init() {
return qnn_manager_->Init();
}

bool IsNodeSupportedByBackend(
std::vector<std::shared_ptr<OpWrapper>>& op_wrappers) {
return qnn_manager_->IsNodeSupportedByBackend(op_wrappers);
}

// this method is specific to compiling multiple graphs
py::array_t<char> Compile() {
if (qnn_manager_->CompileQcir() != Error::Ok) {
QNN_EXECUTORCH_LOG_ERROR("Fail to compile qcir");
return py::array_t<char>(0);
}

// generate context binary if compilation succeeded
QnnExecuTorchContextBinary binary_info;
qnn_manager_->GetContextBinary(binary_info);
// allocate py::array (to pass the result of the C++ function to Python)
auto result = py::array_t<char>(binary_info.nbytes);
auto result_buffer = result.request();
char* result_ptr = (char*)result_buffer.ptr;
std::memcpy(result_ptr, binary_info.buffer, binary_info.nbytes);
return result;
}

py::array_t<char> Compile(
const std::string& graph_name,
std::vector<std::shared_ptr<OpWrapper>>& op_wrappers) {
QnnExecuTorchContextBinary context_binary;
flatbuffers::FlatBufferBuilder builder;
QnnExecuTorchContextBinary binary_info;

if (qnn_manager_->IsOnlinePrepare()) {
if (qnn_manager_->IsOnlinePrepare() || qnn_manager_->IsMultipleGraphs()) {
Contributor: Does this mean QnnManager can support both online prepare and multiple graphs?

Collaborator (Author): This Compile method is invoked in qnn_preprocess.py. Once either of these two compiler specs is recognized, qcir is returned instead of a generated context binary.

In online_prepare mode, users can ship the generated .pte directly and let QnnManager compose the graph on the device side. Although multiple_graphs produces the same binary format and could be used in the same scenario as online_prepare, we expect users to follow the example in our test cases, because the HTP optimization level differs: it is higher on the host side, which generates a more computation-efficient context binary.
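A hedged sketch of the flow described above, not the repo's actual qnn_preprocess.py: `PyQnnManagerAdaptor` is assumed importable after building the backend, `compiler_spec_bytes` is assumed to be a serialized QnnExecuTorchOptions blob, and `op_wrappers` the list of OpWrapper objects built during lowering. The method names match the bindings in this PR.

```python
import PyQnnManagerAdaptor  # assumed importable once the backend is built


def compile_single_graph(compiler_spec_bytes, graph_name, op_wrappers):
    qnn_manager = PyQnnManagerAdaptor.QnnManager(compiler_spec_bytes)
    qnn_manager.Init()
    try:
        if not qnn_manager.IsNodeSupportedByBackend(op_wrappers):
            raise RuntimeError("graph contains unsupported nodes")
        # Under online_prepare or multiple_graphs this returns qcir bytes,
        # which can later be merged via QnnManager(options, [qcir, ...]) and
        # the no-arg Compile(); otherwise it returns a QNN context binary.
        return bytes(qnn_manager.Compile(graph_name, op_wrappers))
    finally:
        qnn_manager.Destroy()
```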

builder_.Reset();
std::vector<flatbuffers::Offset<qcir::Tensor>> tensors;
std::unordered_map<void*, int> tensor_map;

@@ -74,7 +170,7 @@ class PyQnnManager {
tensor_map[wrapper.get()] = i;
index.push_back(i);
tensors.emplace_back(
ToTensor(wrapper->CloneTensorStruct(), &builder));
ToTensor(wrapper->CloneTensorStruct(), &builder_));
}
};

@@ -112,38 +208,48 @@
QNN_VER_PTR(t)->clientBuf.dataSize =
GetDataTypeSize(QNN_VER_PTR(t)->dataType);
params.push_back(tensors.size());
tensors.emplace_back(ToTensor(t, &builder));
tensors.emplace_back(ToTensor(t, &builder_));
}
}

Qnn_OpConfig_t op_config = op_wrapper->GetOpConfig();
operators.emplace_back(qcir::CreateOperatorDirect(
builder,
builder_,
QNN_VER_PTR(op_config)->name,
QNN_VER_PTR(op_config)->packageName,
QNN_VER_PTR(op_config)->typeName,
&inputs,
&outputs,
&params));
}
auto graph = qcir::CreateGraphDirect(builder, &operators, &tensors);
builder.Finish(graph);
context_binary.buffer = builder.GetBufferPointer();
context_binary.nbytes = builder.GetSize();
} else if (
qnn_manager_->Compile(op_wrappers, context_binary) !=
executorch::runtime::Error::Ok) {
return py::array_t<char>(0);
auto graph = qcir::CreateGraphDirect(
builder_, graph_name.c_str(), &operators, &tensors);
std::vector<flatbuffers::Offset<qcir::Graph>> graphs({graph});
auto context = qcir::CreateContextDirect(builder_, &graphs);
builder_.Finish(context);
QnnExecuTorchContextBinary qcir_binary(
{builder_.GetBufferPointer(), builder_.GetSize()});
binary_info = MakeBinaryInfo(qcir_binary);
} else {
if (qnn_manager_->Compile(graph_name, op_wrappers) !=
executorch::runtime::Error::Ok) {
QNN_EXECUTORCH_LOG_ERROR("Fail to compile QNN graph");
return py::array_t<char>(0);
}
if (qnn_manager_->GetContextBinary(binary_info) !=
executorch::runtime::Error::Ok) {
return py::array_t<char>(0);
}
}

// allocate py::array (to pass the result of the C++ function to
// Python)
auto result = py::array_t<char>(context_binary.nbytes);
// allocate py::array (to pass the result of the C++ function to Python)
auto result = py::array_t<char>(binary_info.nbytes);
auto result_buffer = result.request();
char* result_ptr = (char*)result_buffer.ptr;
std::memcpy(result_ptr, context_binary.buffer, context_binary.nbytes);
std::memcpy(result_ptr, binary_info.buffer, binary_info.nbytes);
return result;
}

void Destroy() {
return qnn_manager_->Destroy();
}
@@ -156,38 +262,76 @@ class PyQnnManager {
return qnn_manager_->IsTensorDump();
}

executorch::runtime::Error AllocateTensor() {
return qnn_manager_->AllocateTensor();
executorch::runtime::Error AllocateTensor(const std::string& graph_name) {
return qnn_manager_->AllocateTensor(graph_name);
}

py::list GetGraphInputs() {
py::list GetGraphInputs(const std::string& graph_name) {
py::list ret;
for (const std::shared_ptr<TensorWrapper>& input :
qnn_manager_->GetGraphInputs()) {
qnn_manager_->GetGraphInputs(graph_name)) {
ret.append(PyQnnTensorWrapper(input));
}
return ret;
}

py::list GetGraphOutputs() {
py::list GetGraphOutputs(const std::string& graph_name) {
py::list ret;
for (const std::shared_ptr<TensorWrapper>& output :
qnn_manager_->GetGraphOutputs()) {
qnn_manager_->GetGraphOutputs(graph_name)) {
ret.append(PyQnnTensorWrapper(output));
}
return ret;
}

py::list GetGraphNames() {
py::list ret;
for (const std::string& graph_name : qnn_manager_->GetGraphNames()) {
ret.append(graph_name);
}
return ret;
}

uint64_t GetSpillFillBufferSize() {
return qnn_manager_->GetSpillFillBufferSize();
}

py::array_t<char> MakeBinaryInfo(const py::bytes& ctx_bin) {
py::buffer_info info(py::buffer(ctx_bin).request());
QnnExecuTorchContextBinary binary(
{info.ptr, static_cast<uint64_t>(info.size * info.itemsize)});
auto binary_info = MakeBinaryInfo(binary);
auto result = py::array_t<char>(binary_info.nbytes);
auto result_buffer = result.request();
std::memcpy(result_buffer.ptr, binary_info.buffer, binary_info.nbytes);
return result;
}

private:
QnnExecuTorchContextBinary MakeBinaryInfo(
const QnnExecuTorchContextBinary& ctx_bin) {
auto signature = []() {
return std::to_string(
std::chrono::high_resolution_clock::now().time_since_epoch().count());
};
const uint8_t* base = static_cast<uint8_t*>(ctx_bin.buffer);
std::vector<uint8_t> data(base, base + ctx_bin.nbytes);
// add signature to binary for cache reuse in runtime
builder_.Reset();
auto binary_info = qnn_delegate::CreateBinaryInfoDirect(
builder_, signature().c_str(), &data);
builder_.Finish(binary_info);

return QnnExecuTorchContextBinary(
{builder_.GetBufferPointer(), builder_.GetSize()});
}

// Store the bytes object instead of a raw pointer so that this module will
// keep the bytes alive.
const py::bytes qnn_executorch_option_ptr_;
QnnExecuTorchContextBinary qnn_executorch_context_binary_;
std::shared_ptr<QnnManager> qnn_manager_;
flatbuffers::FlatBufferBuilder builder_;
};
} // namespace qnn
} // namespace backends
1 change: 1 addition & 0 deletions backends/qualcomm/aot/python/targets.bzl
@@ -31,6 +31,7 @@ def define_common_targets():
"//executorch/backends/qualcomm/aot/wrappers:wrappers",
"//executorch/backends/qualcomm/runtime:logging",
"//executorch/backends/qualcomm:schema",
"//executorch/backends/qualcomm:qc_binary_info_schema",
"//executorch/backends/qualcomm/aot/ir:qcir_utils",
"//executorch/backends/qualcomm/runtime:runtime",
"fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()),