Qualcomm AI Engine Direct - Enable zero copy feature #2531

Closed
41 changes: 28 additions & 13 deletions backends/qualcomm/CMakeLists.txt
@@ -118,27 +118,29 @@ include_directories(
#
# declare targets
#
add_library(executorch_backend INTERFACE)
add_library(qcir INTERFACE qcir_schema_output)
add_library(qcir_utils STATIC)
add_library(qnn_schema INTERFACE ${_qnn_schema__outputs})
add_library(executorch_backend INTERFACE)
add_library(qnn_backend STATIC)
add_library(qnn_backend_cache STATIC)
add_library(qnn_context STATIC)
add_library(qnn_device STATIC)
add_library(qnn_executorch_backend SHARED)
add_library(qnn_executorch_header INTERFACE)
add_library(qnn_executorch_logging STATIC)
add_library(qnn_manager STATIC)
add_library(qnn_factory STATIC)
add_library(qnn_function_interface INTERFACE)
add_library(qnn_graph STATIC)
add_library(qnn_header INTERFACE)
add_library(qnn_implementation STATIC)
add_library(qnn_sys_function_interface INTERFACE)
add_library(qnn_sys_implementation STATIC)
add_library(qnn_logger STATIC)
add_library(qnn_manager STATIC)
add_library(qnn_mem_manager STATIC)
add_library(qnn_profiler STATIC)
add_library(qnn_device STATIC)
add_library(qnn_context STATIC)
add_library(qnn_backend_cache STATIC)
add_library(qnn_graph STATIC)
add_library(qnn_backend STATIC)
add_library(qnn_factory STATIC)
add_library(qnn_header INTERFACE)
add_library(qnn_schema INTERFACE ${_qnn_schema__outputs})
add_library(qnn_sys_function_interface INTERFACE)
add_library(qnn_sys_implementation STATIC)
add_library(shared_buffer STATIC)
add_library(wrappers STATIC)
add_library(utils STATIC)

@@ -220,6 +222,13 @@ target_link_libraries(qnn_graph
qnn_context
qnn_profiler
)
target_link_libraries(qnn_mem_manager
PRIVATE
qnn_executorch_logging
qnn_implementation
qnn_context
)

target_link_libraries(qnn_factory
PUBLIC
qnn_header
@@ -229,13 +238,15 @@ target_link_libraries(qnn_factory
qnn_device
qnn_context
qnn_graph
qnn_mem_manager
)
target_link_libraries(qnn_manager
PRIVATE
qnn_factory
wrappers
qnn_schema
utils
shared_buffer
)
target_link_libraries(qnn_executorch_backend
PRIVATE
@@ -249,7 +260,11 @@ target_link_libraries(utils
PRIVATE
qnn_executorch_logging
)

target_link_libraries(shared_buffer
PRIVATE
qnn_executorch_logging
${CMAKE_DL_LIBS}
)
#
# add linker option
#
7 changes: 7 additions & 0 deletions backends/qualcomm/aot/wrappers/TensorWrapper.cpp
@@ -105,6 +105,7 @@ TensorWrapper::TensorWrapper(

Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) {
if (data != nullptr) {
QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW;
QNN_VER_PTR(tensor_)->clientBuf.dataSize = bytes_;
if (copy_data) {
owned_data_ = std::make_unique<char[]>(bytes_);
@@ -144,6 +145,12 @@ Error TensorWrapper::SetName(const std::string& name) {
return Error::Ok;
}

Error TensorWrapper::SetMemHandle(Qnn_MemHandle_t mem_handle) {
QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(tensor_)->memHandle = mem_handle;
return Error::Ok;
}

// base function for Create TensorWrapper
std::shared_ptr<TensorWrapper> CreateTensorWrapper(
const std::string& tensor_name,
26 changes: 24 additions & 2 deletions backends/qualcomm/aot/wrappers/TensorWrapper.h
@@ -59,16 +59,38 @@ class TensorWrapper {
return QNN_VER_PTR(tensor_)->type == QNN_TENSOR_TYPE_STATIC;
};

const void* GetStaticTensorData() const {
return QNN_VER_PTR(tensor_)->clientBuf.data;
std::uint32_t* GetDims() const {
return QNN_VER_PTR(tensor_)->dimensions;
};

Qnn_DataType_t GetDataType() const {
return QNN_VER_PTR(tensor_)->dataType;
};

Qnn_MemHandle_t GetMemHandle() const {
return QNN_VER_PTR(tensor_)->memHandle;
};

Qnn_TensorMemType_t GetMemType() const {
return QNN_VER_PTR(tensor_)->memType;
};

std::string GetName() const {
return qnn_tensor_name_;
};

std::uint32_t GetRank() const {
return QNN_VER_PTR(tensor_)->rank;
};

const void* GetStaticTensorData() const {
return QNN_VER_PTR(tensor_)->clientBuf.data;
};

Error SetName(const std::string& name);

Error SetMemHandle(Qnn_MemHandle_t mem_handle);

private:
// need this to handle QNN_TENSOR_ERROR_NAME_HASH_COLLISION
std::string qnn_tensor_name_;
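A small, hypothetical sketch of how the new accessors could be combined: the AttachMemHandle helper below is not part of this PR; it only illustrates that SetMemHandle() switches a tensor from client-buffer to handle-based memory, which GetMemType() and GetMemHandle() can then report.

#include <executorch/backends/qualcomm/aot/wrappers/TensorWrapper.h>

#include <memory>

namespace torch {
namespace executor {
namespace qnn {

// Hypothetical helper (not in this PR): attach an already-registered QNN
// memory handle to a tensor wrapper unless one is attached. SetMemHandle()
// also flips the tensor's memType to QNN_TENSORMEMTYPE_MEMHANDLE.
Error AttachMemHandle(
    const std::shared_ptr<TensorWrapper>& wrapper,
    Qnn_MemHandle_t handle) {
  if (wrapper->GetMemType() == QNN_TENSORMEMTYPE_MEMHANDLE &&
      wrapper->GetMemHandle() != nullptr) {
    return Error::Ok; // a handle is already registered; nothing to do
  }
  return wrapper->SetMemHandle(handle);
}

} // namespace qnn
} // namespace executor
} // namespace torch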
6 changes: 6 additions & 0 deletions backends/qualcomm/passes/insert_io_qdq.py
@@ -38,6 +38,12 @@ def _ceate_args(self, target: torch.fx.node.Target, quant_attrs: Dict):
arg_schemas = list(target._schema.arguments)[1:]
for arg_schema in arg_schemas:
name = arg_schema.name
# TODO: The dequantize node's new "out_dtype" parameter has no counterpart
# in the quant_attrs gathered from other nodes, so looking it up would raise
# a KeyError. For now the output dtype of our dequantize node is always
# float (PyTorch's default), so it is safe to skip.
if name == "out_dtype":
continue
value = quant_attrs[name]
if type(arg_schema.type) == torch.tensor and type(value) in [int, float]:
value = torch.tensor(value)
7 changes: 7 additions & 0 deletions backends/qualcomm/runtime/CMakeLists.txt
@@ -47,3 +47,10 @@ target_sources(utils
PRIVATE
${CMAKE_CURRENT_LIST_DIR}/Utils.cpp
)

# shared_buffer
target_sources(shared_buffer
PRIVATE
${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.h
${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.cpp
)
12 changes: 12 additions & 0 deletions backends/qualcomm/runtime/QnnExecuTorch.h
@@ -8,8 +8,10 @@
#pragma once

#ifdef __cplusplus
#include <cstddef>
#include <cstdint>
#else
#include <stddef.h>
#include <stdint.h>
#endif

@@ -31,6 +33,16 @@ typedef struct {
}
// clang-format on

/// Allocate specific tensors (usually graph inputs and outputs) on shared
/// memory. Users are responsible for allocating enough bytes for the tensor
/// and for setting the alignment to MemoryAllocator::kDefaultAlignment
/// (see runtime/core/memory_allocator.h). Returns a valid pointer if the
/// allocation succeeds.
void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment);

/// Free the allocated shared memory.
void QnnExecuTorchFreeCustomMem(void* buffer_ptr);

#ifdef __cplusplus
}
#endif // __cplusplus
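A minimal caller-side sketch of the two new entry points, assuming the buffer backs a graph input. The byte count is illustrative, and alignof(std::max_align_t) stands in for MemoryAllocator::kDefaultAlignment; neither value comes from this PR.

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>

#include <cstddef>
#include <cstring>

int main() {
  // Illustrative size; the real value comes from the model's input tensor.
  constexpr size_t kInputBytes = 1 * 3 * 224 * 224 * sizeof(float);
  // Stand-in for MemoryAllocator::kDefaultAlignment
  // (see runtime/core/memory_allocator.h).
  constexpr size_t kAlignment = alignof(std::max_align_t);

  // Allocate the graph input on shared memory so the backend's RegisterMem()
  // path can map it and skip the host-side copy at execute() time.
  void* input_buf = QnnExecuTorchAllocCustomMem(kInputBytes, kAlignment);
  if (input_buf == nullptr) {
    return 1; // allocation failed; fall back to ordinary heap buffers
  }

  // Fill the buffer, then hand the same pointer to the input Tensor that is
  // passed to the delegate when the method is executed.
  std::memset(input_buf, 0, kInputBytes);
  // ... run inference ...

  QnnExecuTorchFreeCustomMem(input_buf);
  return 0;
}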
18 changes: 13 additions & 5 deletions backends/qualcomm/runtime/QnnExecuTorchBackend.cpp
@@ -188,19 +188,27 @@ Error QnnExecuTorchBackend::execute(
std::vector<Qnn_Tensor_t> input_tensor_structs;
std::vector<Qnn_Tensor_t> output_tensor_structs;

input_tensor_structs.reserve(input_tensors.size());
for (int i = 0; i < input_tensors.size(); ++i) {
input_tensors[i]->FillDataBuffer(
args[i]->toTensor().const_data_ptr(), true /* copy_data */);
if (qnn_manager->RegisterMem(
args[i]->toTensor().mutable_data_ptr(), input_tensors[i]) !=
Error::Ok) {
input_tensors[i]->FillDataBuffer(
args[i]->toTensor().const_data_ptr(), true /* copy_data */);
}
input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct());
}

int output_index = input_tensors.size();
for (const auto& output_tensor : output_tensors) {
// pos=0 limits the search to the prefix
if (output_tensor->GetName().rfind("output_", 0) == 0) {
output_tensor->FillDataBuffer(
args[output_index]->toTensor().mutable_data_ptr(),
false /* copy_data */);
void* mutable_data_ptr =
args[output_index]->toTensor().mutable_data_ptr();
if (qnn_manager->RegisterMem(mutable_data_ptr, output_tensor) !=
Error::Ok) {
output_tensor->FillDataBuffer(mutable_data_ptr, false /* copy_data */);
}
output_index++;
}
output_tensor_structs.push_back(output_tensor->CloneTensorStruct());
72 changes: 62 additions & 10 deletions backends/qualcomm/runtime/QnnManager.cpp
@@ -6,9 +6,9 @@
* LICENSE file in the root directory of this source tree.
*/
#include <executorch/backends/qualcomm/runtime/QnnManager.h>
#include <executorch/backends/qualcomm/runtime/SharedBuffer.h>
#include <executorch/backends/qualcomm/runtime/Utils.h>
#include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>

#include <cstdlib>
#include <cstring>
#include <fstream>
@@ -54,7 +54,9 @@ QnnManager::QnnManager(
"the size of qnn context binary: %d",
qnn_executorch_context_binary.nbytes);
QNN_EXECUTORCH_LOG_INFO(
"Is on-device graph construction: %d", options_->online_prepare());
"Is on-device graph construction: %d", options->online_prepare());
QNN_EXECUTORCH_LOG_INFO(
"Enable shared buffer: %d", options->shared_buffer());
}

if (library_path.empty()) {
@@ -82,6 +84,53 @@ Error QnnManager::LoadQnnLibrary() {
return ret;
}

Error QnnManager::RegisterMem(
void* data_ptr,
const std::shared_ptr<TensorWrapper>& tensor_wrapper) {
SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
// Shared buffer is not enabled
if (!options_->shared_buffer())
return Error::Internal;

if (backend_params_ptr_->qnn_mem_manager_ptr_ == nullptr) {
QNN_EXECUTORCH_LOG_WARN(
"Backend %s doesn't supported shared buffer.",
EnumNameQnnExecuTorchBackendType(
options_->backend_options()->backend_type()));
return Error::Internal;
}

if (!shared_buffer_manager.IsAllocated(data_ptr)) {
// Two scenarios can lead here:
// 1. The tensor is an input/output of a partitioned graph.
// 2. The user did not allocate the buffer with the
//    QnnExecuTorchAllocCustomMem API.
return Error::Internal;
} else if (backend_params_ptr_->qnn_mem_manager_ptr_->IsRegistered(
tensor_wrapper->GetMemHandle())) {
if (options_->log_level() >= QnnExecuTorchLogLevel::kLogLevelInfo)
QNN_EXECUTORCH_LOG_INFO(
"Tensor name %s has been registered shared memory.",
tensor_wrapper->GetName().c_str());
return Error::Ok;
}

int32_t mem_fd = SharedBuffer::GetSharedBufferManager().MemToFd(data_ptr);
if (mem_fd == -1) {
QNN_EXECUTORCH_LOG_WARN(
"Tensor name %s is failed to get file descriptor.",
tensor_wrapper->GetName().c_str());
return Error::Internal;
}
ET_CHECK_OR_RETURN_ERROR(
backend_params_ptr_->qnn_mem_manager_ptr_->RegisterMem(
tensor_wrapper, mem_fd) == Error::Ok,
Internal,
"Fail to register to shared memory.");

return Error::Ok;
}

Error QnnManager::Init() {
ET_CHECK_OR_RETURN_ERROR(
LoadQnnLibrary() == Error::Ok, Internal, "Fail to load Qnn library");
@@ -219,14 +268,6 @@ void QnnManager::Destroy() {
qnn_loaded_backend_.TerminateAllBackends();
}

bool QnnManager::IsAvailable() {
return true;
}

bool QnnManager::IsOnlinePrepare() {
return options_->online_prepare();
}

bool QnnManager::IsNodeSupportedByBackend(
std::vector<std::shared_ptr<OpWrapper>>& op_wrappers) {
Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -329,3 +370,14 @@ Error QnnManager::Compile(
} // namespace qnn
} // namespace executor
} // namespace torch
void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment) {
using torch::executor::qnn::SharedBuffer;
void* buffer_ptr =
SharedBuffer::GetSharedBufferManager().AllocMem(bytes, alignment);
return buffer_ptr;
}

void QnnExecuTorchFreeCustomMem(void* buffer_ptr) {
using torch::executor::qnn::SharedBuffer;
SharedBuffer::GetSharedBufferManager().FreeMem(buffer_ptr);
}
14 changes: 11 additions & 3 deletions backends/qualcomm/runtime/QnnManager.h
@@ -42,21 +42,29 @@ class QnnManager {

void Destroy();

bool IsAvailable();
bool IsAvailable() {
return true;
}

bool IsOnlinePrepare() {
return options_->online_prepare();
}

bool IsTensorDump() {
return options_->tensor_dump_output_path()->size() > 0;
}

bool IsOnlinePrepare();

bool IsNodeSupportedByBackend(
std::vector<std::shared_ptr<OpWrapper>>& op_wrappers);

Error Compile(
std::vector<std::shared_ptr<OpWrapper>>& op_wrappers,
QnnExecuTorchContextBinary& qnn_executorch_context_binary);

Error RegisterMem(
void* data_ptr,
const std::shared_ptr<TensorWrapper>& tensor_wrapper);

std::vector<std::shared_ptr<TensorWrapper>> GetGraphInputs() {
return input_tensors_;
}