Dev weight sharing #6657
@@ -8,10 +8,11 @@
#pragma once
#include <executorch/backends/qualcomm/aot/ir/qcir_utils.h>
#include <executorch/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h>
#include <executorch/backends/qualcomm/qc_binary_info_generated.h>
#include <executorch/backends/qualcomm/qc_compiler_spec_generated.h>
#include <executorch/backends/qualcomm/runtime/Logging.h>
#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/backends/qualcomm/runtime/QnnManager.h>
#include <executorch/backends/qualcomm/schema_generated.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
@@ -35,32 +36,127 @@ class PyQnnManager {
        qnn_executorch_options, qnn_executorch_context_binary_);
  }

  // used for loading context binary directly
  explicit PyQnnManager(const py::bytes& buffer, const py::bytes& ctx_bin)
      : qnn_executorch_option_ptr_(buffer) {
    auto qnn_executorch_options = GetQnnExecuTorchOptions(
        qnn_executorch_option_ptr_.cast<std::string_view>().data());

    py::buffer_info info(py::buffer(ctx_bin).request());
    qnn_executorch_context_binary_.buffer = static_cast<void*>(info.ptr);
    qnn_executorch_context_binary_.buffer = info.ptr;
    qnn_executorch_context_binary_.nbytes = info.size * info.itemsize;
    qnn_manager_ = std::make_shared<QnnManager>(
        qnn_executorch_options, qnn_executorch_context_binary_);
  }

  // used for loading multiple graphs in qcir
  explicit PyQnnManager(const py::bytes& buffer, const py::list& qcirs)
      : qnn_executorch_option_ptr_(buffer) {
    auto qnn_executorch_options = GetQnnExecuTorchOptions(
        qnn_executorch_option_ptr_.cast<std::string_view>().data());

    // merge multiple qcirs into one context with multiple graphs
    std::vector<flatbuffers::Offset<qcir::Graph>> graphs;
    for (size_t i = 0; i < qcirs.size(); ++i) {
      py::buffer_info info(py::buffer(qcirs[i].cast<py::bytes>()).request());
      flatbuffers::Verifier verifier_binary_info(
          static_cast<const uint8_t* const>(info.ptr),
          info.size * info.itemsize);
      if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) {
        QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info");
        return;
      }
      auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr);

      flatbuffers::Verifier verifier_qcir(
          binary_info->data()->data(), binary_info->data()->size());
      if (!qcir::VerifyContextBuffer(verifier_qcir)) {
        QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format");
        return;
      }
      auto context = qcir::GetContext(binary_info->data()->data());
      for (const auto& graph : *context->graphs()) {
        std::vector<flatbuffers::Offset<qcir::Tensor>> tensors;
        for (const auto tensor : *graph->tensors()) {
          // here we need to take a detour to merge multiple qcir flatbuffers
          // outer ToTensor
          //   return: flatbuffers::Offset<Tensor>
          //   consume: QnnTensor, flatbuffers::FlatBufferBuilder*
          // inner ToTensor
          //   return: QnnTensor
          //   consume: flatbuffers::Vector<::flatbuffers::Offset<qcir::Tensor>>
          tensors.emplace_back(ToTensor(ToTensor(tensor), &builder_));
        }
        std::vector<flatbuffers::Offset<qcir::Operator>> nodes;
        for (const auto& node : *graph->nodes()) {
          int32_t* inputs_ptr = const_cast<int32_t*>(node->inputs()->data());
          int32_t* outputs_ptr = const_cast<int32_t*>(node->outputs()->data());
          int32_t* params_ptr = const_cast<int32_t*>(node->params()->data());
          std::vector<int32_t> inputs(
              inputs_ptr, inputs_ptr + node->inputs()->size());
          std::vector<int32_t> outputs(
              outputs_ptr, outputs_ptr + node->outputs()->size());
          std::vector<int32_t> params(
              params_ptr, params_ptr + node->params()->size());
          nodes.emplace_back(qcir::CreateOperatorDirect(
              builder_,
              node->name()->str().c_str(),
              node->package_name()->str().c_str(),
              node->type_name()->str().c_str(),
              &inputs,
              &outputs,
              &params));
        }
        graphs.emplace_back(qcir::CreateGraphDirect(
            builder_, graph->name()->str().c_str(), &nodes, &tensors));
      }
    }

    auto context = qcir::CreateContextDirect(builder_, &graphs);
    builder_.Finish(context);
    QnnExecuTorchContextBinary qcir_bin(
        {builder_.GetBufferPointer(), builder_.GetSize()});

    qnn_executorch_context_binary_ = MakeBinaryInfo(qcir_bin);
    qnn_manager_ = std::make_shared<QnnManager>(
        qnn_executorch_options, qnn_executorch_context_binary_);
  }

  executorch::runtime::Error Init() {
    return qnn_manager_->Init();
  }

  bool IsNodeSupportedByBackend(
      std::vector<std::shared_ptr<OpWrapper>>& op_wrappers) {
    return qnn_manager_->IsNodeSupportedByBackend(op_wrappers);
  }

  // this method is specific for compiling multi-graphs
  py::array_t<char> Compile() {
    if (qnn_manager_->CompileQcir() != Error::Ok) {
      QNN_EXECUTORCH_LOG_ERROR("Fail to compile qcir");
      return py::array_t<char>(0);
    }

    // generate context binary if compilation succeeded
    QnnExecuTorchContextBinary binary_info;
    qnn_manager_->GetContextBinary(binary_info);
    // allocate py::array (to pass the result of the C++ function to Python)
    auto result = py::array_t<char>(binary_info.nbytes);
    auto result_buffer = result.request();
    char* result_ptr = (char*)result_buffer.ptr;
    std::memcpy(result_ptr, binary_info.buffer, binary_info.nbytes);
    return result;
  }

  py::array_t<char> Compile(
      const std::string& graph_name,
      std::vector<std::shared_ptr<OpWrapper>>& op_wrappers) {
    QnnExecuTorchContextBinary context_binary;
    flatbuffers::FlatBufferBuilder builder;
    QnnExecuTorchContextBinary binary_info;

    if (qnn_manager_->IsOnlinePrepare()) {
    if (qnn_manager_->IsOnlinePrepare() || qnn_manager_->IsMultipleGraphs()) {

Review comment: Does it mean qnn manager can support both online prepare and multiple graphs?

      builder_.Reset();
      std::vector<flatbuffers::Offset<qcir::Tensor>> tensors;
      std::unordered_map<void*, int> tensor_map;

@@ -74,7 +170,7 @@ class PyQnnManager {
          tensor_map[wrapper.get()] = i;
          index.push_back(i);
          tensors.emplace_back(
              ToTensor(wrapper->CloneTensorStruct(), &builder));
              ToTensor(wrapper->CloneTensorStruct(), &builder_));
        }
      };

@@ -112,38 +208,48 @@ class PyQnnManager {
            QNN_VER_PTR(t)->clientBuf.dataSize =
                GetDataTypeSize(QNN_VER_PTR(t)->dataType);
            params.push_back(tensors.size());
            tensors.emplace_back(ToTensor(t, &builder));
            tensors.emplace_back(ToTensor(t, &builder_));
          }
        }

        Qnn_OpConfig_t op_config = op_wrapper->GetOpConfig();
        operators.emplace_back(qcir::CreateOperatorDirect(
            builder,
            builder_,
            QNN_VER_PTR(op_config)->name,
            QNN_VER_PTR(op_config)->packageName,
            QNN_VER_PTR(op_config)->typeName,
            &inputs,
            &outputs,
            &params));
      }
      auto graph = qcir::CreateGraphDirect(builder, &operators, &tensors);
      builder.Finish(graph);
      context_binary.buffer = builder.GetBufferPointer();
      context_binary.nbytes = builder.GetSize();
    } else if (
        qnn_manager_->Compile(op_wrappers, context_binary) !=
        executorch::runtime::Error::Ok) {
      return py::array_t<char>(0);
      auto graph = qcir::CreateGraphDirect(
          builder_, graph_name.c_str(), &operators, &tensors);
      std::vector<flatbuffers::Offset<qcir::Graph>> graphs({graph});
      auto context = qcir::CreateContextDirect(builder_, &graphs);
      builder_.Finish(context);
      QnnExecuTorchContextBinary qcir_binary(
          {builder_.GetBufferPointer(), builder_.GetSize()});
      binary_info = MakeBinaryInfo(qcir_binary);
    } else {
      if (qnn_manager_->Compile(graph_name, op_wrappers) !=
          executorch::runtime::Error::Ok) {
        QNN_EXECUTORCH_LOG_ERROR("Fail to compile QNN graph");
        return py::array_t<char>(0);
      }
      if (qnn_manager_->GetContextBinary(binary_info) !=
          executorch::runtime::Error::Ok) {
        return py::array_t<char>(0);
      }
    }

    // allocate py::array (to pass the result of the C++ function to
    // Python)
    auto result = py::array_t<char>(context_binary.nbytes);
    // allocate py::array (to pass the result of the C++ function to Python)
    auto result = py::array_t<char>(binary_info.nbytes);
    auto result_buffer = result.request();
    char* result_ptr = (char*)result_buffer.ptr;
    std::memcpy(result_ptr, context_binary.buffer, context_binary.nbytes);
    std::memcpy(result_ptr, binary_info.buffer, binary_info.nbytes);
    return result;
  }

  void Destroy() {
    return qnn_manager_->Destroy();
  }

@@ -156,38 +262,76 @@ class PyQnnManager {
    return qnn_manager_->IsTensorDump();
  }

  executorch::runtime::Error AllocateTensor() {
    return qnn_manager_->AllocateTensor();
  executorch::runtime::Error AllocateTensor(const std::string& graph_name) {
    return qnn_manager_->AllocateTensor(graph_name);
  }

  py::list GetGraphInputs() {
  py::list GetGraphInputs(const std::string& graph_name) {
    py::list ret;
    for (const std::shared_ptr<TensorWrapper>& input :
         qnn_manager_->GetGraphInputs()) {
         qnn_manager_->GetGraphInputs(graph_name)) {
      ret.append(PyQnnTensorWrapper(input));
    }
    return ret;
  }

  py::list GetGraphOutputs() {
  py::list GetGraphOutputs(const std::string& graph_name) {
    py::list ret;
    for (const std::shared_ptr<TensorWrapper>& output :
         qnn_manager_->GetGraphOutputs()) {
         qnn_manager_->GetGraphOutputs(graph_name)) {
      ret.append(PyQnnTensorWrapper(output));
    }
    return ret;
  }

  py::list GetGraphNames() {
    py::list ret;
    for (const std::string& graph_name : qnn_manager_->GetGraphNames()) {
      ret.append(graph_name);
    }
    return ret;
  }

  uint64_t GetSpillFillBufferSize() {
    return qnn_manager_->GetSpillFillBufferSize();
  }

  py::array_t<char> MakeBinaryInfo(const py::bytes& ctx_bin) {
    py::buffer_info info(py::buffer(ctx_bin).request());
    QnnExecuTorchContextBinary binary(
        {info.ptr, static_cast<uint64_t>(info.size * info.itemsize)});
    auto binary_info = MakeBinaryInfo(binary);
    auto result = py::array_t<char>(binary_info.nbytes);
    auto result_buffer = result.request();
    std::memcpy(result_buffer.ptr, binary_info.buffer, binary_info.nbytes);
    return result;
  }

 private:
  QnnExecuTorchContextBinary MakeBinaryInfo(
      const QnnExecuTorchContextBinary& ctx_bin) {
    auto signature = []() {
      return std::to_string(
          std::chrono::high_resolution_clock::now().time_since_epoch().count());
    };
    const uint8_t* base = static_cast<uint8_t*>(ctx_bin.buffer);
    std::vector<uint8_t> data(base, base + ctx_bin.nbytes);
    // add signature to binary for cache reuse in runtime
    builder_.Reset();
    auto binary_info = qnn_delegate::CreateBinaryInfoDirect(
        builder_, signature().c_str(), &data);
    builder_.Finish(binary_info);

    return QnnExecuTorchContextBinary(
        {builder_.GetBufferPointer(), builder_.GetSize()});
  }

  // Store the bytes object instead of a raw pointer so that this module will
  // keep the bytes alive.
  const py::bytes qnn_executorch_option_ptr_;
  QnnExecuTorchContextBinary qnn_executorch_context_binary_;
  std::shared_ptr<QnnManager> qnn_manager_;
  flatbuffers::FlatBufferBuilder builder_;
};
} // namespace qnn
} // namespace backends

Review conversation:

What does `ToTensor(ToTensor(tensor), &builder_)` mean?

I'm guessing here we'll deduplicate tensors?

The inner `ToTensor` is used to convert the serialized tensor in `qcir` to the `QnnTensor` defined in the QNN SDK header. The outer `ToTensor` is used to convert `QnnTensor` back into a flatbuffers-API-compatible tensor for building `qcir`. It looks like flatbuffers has no mechanism for merging binaries, so this is the detour I can come up with so far. Will rephrase the comment for better understanding.
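
For context, a minimal sketch of that round-trip (illustrative only, not part of this patch): it assumes the two `ToTensor` overloads described above from `qcir_utils.h`, and the helper name `CopyGraphTensors` and its exact namespace nesting are hypothetical.

// Hypothetical helper: re-serialize the tensors of one parsed qcir graph into
// the FlatBufferBuilder that assembles the merged multi-graph context.
#include <executorch/backends/qualcomm/aot/ir/qcir_utils.h>

#include <vector>

namespace backends {
namespace qnn {

std::vector<flatbuffers::Offset<qcir::Tensor>> CopyGraphTensors(
    const qcir::Graph* graph,
    flatbuffers::FlatBufferBuilder* builder) {
  std::vector<flatbuffers::Offset<qcir::Tensor>> copied;
  for (const auto tensor : *graph->tensors()) {
    // inner ToTensor: serialized qcir tensor -> QnnTensor (QNN SDK struct)
    // outer ToTensor: QnnTensor -> flatbuffers::Offset<qcir::Tensor> written
    // into the destination builder
    copied.emplace_back(ToTensor(ToTensor(tensor), builder));
  }
  return copied;
}

} // namespace qnn
} // namespace backends

This mirrors the tensor loop in the new multi-graph constructor above.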