From e541fb4695f625b292c2a94c6afa21a4d7863188 Mon Sep 17 00:00:00 2001
From: shewu-quic
Date: Thu, 14 Mar 2024 16:55:59 +0800
Subject: [PATCH] Qualcomm AI Engine Direct - Enable zero copy feature

Summary:
- Add the "shared_buffer" argument to compiler_spec, qnn_executor_runner,
  and the test scripts
- Ideally, shared_buffer would be a runtime option, since users are
  responsible for allocating memory for tensors on device. However, there is
  currently no way to pass a runtime option to QnnBackend, so we put it in
  compile_spec for now.
- Implement SharedBuffer to allocate and free RPC memory
- Add QnnMemManager to register shared buffers for tensors
- At execution time, we register the memory of tensor data with QNN, and
  deregister it when QnnBackend is destroyed
- Add two APIs, `void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment)`
  and `void QnnExecuTorchFreeCustomMem(void* buffer_ptr)`, to allocate and free
  RPC memory through SharedBuffer
- Users are responsible for allocating "enough" bytes for each tensor and for
  setting the alignment to MemoryAllocator::kDefaultAlignment.
  See runtime/core/memory_allocator.h.
---
 backends/qualcomm/CMakeLists.txt | 41 +++--
 .../qualcomm/aot/wrappers/TensorWrapper.cpp | 7 +
 .../qualcomm/aot/wrappers/TensorWrapper.h | 26 +++-
 backends/qualcomm/passes/insert_io_qdq.py | 6 +
 backends/qualcomm/runtime/CMakeLists.txt | 7 +
 backends/qualcomm/runtime/QnnExecuTorch.h | 12 ++
 .../qualcomm/runtime/QnnExecuTorchBackend.cpp | 18 ++-
 backends/qualcomm/runtime/QnnManager.cpp | 72 +++++++--
 backends/qualcomm/runtime/QnnManager.h | 14 +-
 backends/qualcomm/runtime/SharedBuffer.cpp | 136 ++++++++++++++++
 backends/qualcomm/runtime/SharedBuffer.h | 72 +++++++++
 .../qualcomm/runtime/backends/CMakeLists.txt | 8 +
 .../runtime/backends/QnnBackendFactory.cpp | 2 +
 .../runtime/backends/QnnBackendFactory.h | 6 +-
 .../runtime/backends/QnnMemManager.cpp | 66 ++++++++
 .../qualcomm/runtime/backends/QnnMemManager.h | 43 ++++++
 .../serialization/qnn_compile_spec_schema.py | 1 +
 backends/qualcomm/serialization/schema.fbs | 3 +
 backends/qualcomm/tests/test_qnn_delegate.py | 62 ++++++++
 backends/qualcomm/tests/utils.py | 16 +-
 backends/qualcomm/utils/utils.py | 6 +
 examples/qualcomm/CMakeLists.txt | 2 +-
 .../executor_runner/qnn_executor_runner.cpp | 146 ++++++++++++++----
 examples/qualcomm/scripts/deeplab_v3.py | 2 +
 examples/qualcomm/scripts/dummy_llama2.py | 2 +
 examples/qualcomm/scripts/edsr.py | 2 +
 examples/qualcomm/scripts/export_example.py | 8 +-
 examples/qualcomm/scripts/inception_v3.py | 2 +
 examples/qualcomm/scripts/inception_v4.py | 2 +
 .../qualcomm/scripts/mobilebert_fine_tune.py | 2 +
 examples/qualcomm/scripts/mobilenet_v2.py | 2 +
 examples/qualcomm/scripts/torchvision_vit.py | 2 +
 examples/qualcomm/scripts/utils.py | 26 +++-
 33 files changed, 751 insertions(+), 71 deletions(-)
 create mode 100644 backends/qualcomm/runtime/SharedBuffer.cpp
 create mode 100644 backends/qualcomm/runtime/SharedBuffer.h
 create mode 100644 backends/qualcomm/runtime/backends/QnnMemManager.cpp
 create mode 100644 backends/qualcomm/runtime/backends/QnnMemManager.h

diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
index db7e3890396..8883e5ee026 100644
--- a/backends/qualcomm/CMakeLists.txt
+++ b/backends/qualcomm/CMakeLists.txt
@@ -118,27 +118,29 @@ include_directories(
 #
 # declare targets
 #
+add_library(executorch_backend INTERFACE)
 add_library(qcir INTERFACE qcir_schema_output)
 add_library(qcir_utils STATIC)
-add_library(qnn_schema INTERFACE ${_qnn_schema__outputs})
-add_library(executorch_backend INTERFACE) +add_library(qnn_backend STATIC) +add_library(qnn_backend_cache STATIC) +add_library(qnn_context STATIC) +add_library(qnn_device STATIC) add_library(qnn_executorch_backend SHARED) add_library(qnn_executorch_header INTERFACE) add_library(qnn_executorch_logging STATIC) -add_library(qnn_manager STATIC) +add_library(qnn_factory STATIC) add_library(qnn_function_interface INTERFACE) +add_library(qnn_graph STATIC) +add_library(qnn_header INTERFACE) add_library(qnn_implementation STATIC) -add_library(qnn_sys_function_interface INTERFACE) -add_library(qnn_sys_implementation STATIC) add_library(qnn_logger STATIC) +add_library(qnn_manager STATIC) +add_library(qnn_mem_manager STATIC) add_library(qnn_profiler STATIC) -add_library(qnn_device STATIC) -add_library(qnn_context STATIC) -add_library(qnn_backend_cache STATIC) -add_library(qnn_graph STATIC) -add_library(qnn_backend STATIC) -add_library(qnn_factory STATIC) -add_library(qnn_header INTERFACE) +add_library(qnn_schema INTERFACE ${_qnn_schema__outputs}) +add_library(qnn_sys_function_interface INTERFACE) +add_library(qnn_sys_implementation STATIC) +add_library(shared_buffer STATIC) add_library(wrappers STATIC) add_library(utils STATIC) @@ -220,6 +222,13 @@ target_link_libraries(qnn_graph qnn_context qnn_profiler ) +target_link_libraries(qnn_mem_manager + PRIVATE + qnn_executorch_logging + qnn_implementation + qnn_context +) + target_link_libraries(qnn_factory PUBLIC qnn_header @@ -229,6 +238,7 @@ target_link_libraries(qnn_factory qnn_device qnn_context qnn_graph + qnn_mem_manager ) target_link_libraries(qnn_manager PRIVATE @@ -236,6 +246,7 @@ target_link_libraries(qnn_manager wrappers qnn_schema utils + shared_buffer ) target_link_libraries(qnn_executorch_backend PRIVATE @@ -249,7 +260,11 @@ target_link_libraries(utils PRIVATE qnn_executorch_logging ) - +target_link_libraries(shared_buffer + PRIVATE + qnn_executorch_logging + ${CMAKE_DL_LIBS} +) # # add linker option # diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp index 2a2cda84c55..9d80fd735aa 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp @@ -105,6 +105,7 @@ TensorWrapper::TensorWrapper( Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) { if (data != nullptr) { + QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW; QNN_VER_PTR(tensor_)->clientBuf.dataSize = bytes_; if (copy_data) { owned_data_ = std::make_unique(bytes_); @@ -144,6 +145,12 @@ Error TensorWrapper::SetName(const std::string& name) { return Error::Ok; } +Error TensorWrapper::SetMemHandle(Qnn_MemHandle_t mem_handle) { + QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(tensor_)->memHandle = mem_handle; + return Error::Ok; +} + // base function for Create TensorWrapper std::shared_ptr CreateTensorWrapper( const std::string& tensor_name, diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.h b/backends/qualcomm/aot/wrappers/TensorWrapper.h index 5c2be693486..c973196e9d5 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.h +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.h @@ -59,16 +59,38 @@ class TensorWrapper { return QNN_VER_PTR(tensor_)->type == QNN_TENSOR_TYPE_STATIC; }; - const void* GetStaticTensorData() const { - return QNN_VER_PTR(tensor_)->clientBuf.data; + std::uint32_t* GetDims() const { + return QNN_VER_PTR(tensor_)->dimensions; + }; + + Qnn_DataType_t GetDataType() const { + return 
QNN_VER_PTR(tensor_)->dataType; + }; + + Qnn_MemHandle_t const GetMemHandle() { + return QNN_VER_PTR(tensor_)->memHandle; + }; + + Qnn_TensorMemType_t GetMemType() const { + return QNN_VER_PTR(tensor_)->memType; }; std::string GetName() const { return qnn_tensor_name_; }; + std::uint32_t GetRank() const { + return QNN_VER_PTR(tensor_)->rank; + }; + + const void* GetStaticTensorData() const { + return QNN_VER_PTR(tensor_)->clientBuf.data; + }; + Error SetName(const std::string& name); + Error SetMemHandle(Qnn_MemHandle_t mem_handle); + private: // need this to handle QNN_TENSOR_ERROR_NAME_HASH_COLLISION std::string qnn_tensor_name_; diff --git a/backends/qualcomm/passes/insert_io_qdq.py b/backends/qualcomm/passes/insert_io_qdq.py index e1dd55a916a..971e4895c36 100644 --- a/backends/qualcomm/passes/insert_io_qdq.py +++ b/backends/qualcomm/passes/insert_io_qdq.py @@ -38,6 +38,12 @@ def _ceate_args(self, target: torch.fx.node.Target, quant_attrs: Dict): arg_schemas = list(target._schema.arguments)[1:] for arg_schema in arg_schemas: name = arg_schema.name + # TODO: Due to the new parameter "out_dtype" in the dequantize node, + # it could not be found in the quant_attrs of other nodes, + # and it will cause a key error. For now, the output type + # of our dequantize node is only float. (by default in pytorch) + if name == "out_dtype": + continue value = quant_attrs[name] if type(arg_schema.type) == torch.tensor and type(value) in [int, float]: value = torch.tensor(value) diff --git a/backends/qualcomm/runtime/CMakeLists.txt b/backends/qualcomm/runtime/CMakeLists.txt index 615c6320b5d..3a59c3ba2b3 100644 --- a/backends/qualcomm/runtime/CMakeLists.txt +++ b/backends/qualcomm/runtime/CMakeLists.txt @@ -47,3 +47,10 @@ target_sources(utils PRIVATE ${CMAKE_CURRENT_LIST_DIR}/Utils.cpp ) + +# shared_buffer +target_sources(shared_buffer + PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.h + ${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.cpp +) diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h index e3c76742e2a..d54de1059d7 100644 --- a/backends/qualcomm/runtime/QnnExecuTorch.h +++ b/backends/qualcomm/runtime/QnnExecuTorch.h @@ -8,8 +8,10 @@ #pragma once #ifdef __cplusplus +#include #include #else +#include #include #endif @@ -31,6 +33,16 @@ typedef struct { } // clang-format on +/// Allocate specific tensors (usually graph inputs and outputs) on shared +/// memory. Users are responsible to allocate "enough" tensor bytes, and set +/// alignment as MemoryAllocator::kDefaultAlignment. +/// See runtime/core/memory_allocator.h. The function returns a valid pointer +/// if allocation is successful. +void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment); + +/// Free the allocated shared memory. 
+void QnnExecuTorchFreeCustomMem(void* buffer_ptr); + #ifdef __cplusplus } #endif // __cplusplus diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index b093c274c38..77449703c5f 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -188,9 +188,14 @@ Error QnnExecuTorchBackend::execute( std::vector input_tensor_structs; std::vector output_tensor_structs; + input_tensor_structs.reserve(input_tensors.size()); for (int i = 0; i < input_tensors.size(); ++i) { - input_tensors[i]->FillDataBuffer( - args[i]->toTensor().const_data_ptr(), true /* copy_data */); + if (qnn_manager->RegisterMem( + args[i]->toTensor().mutable_data_ptr(), input_tensors[i]) != + Error::Ok) { + input_tensors[i]->FillDataBuffer( + args[i]->toTensor().const_data_ptr(), true /* copy_data */); + } input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct()); } @@ -198,9 +203,12 @@ Error QnnExecuTorchBackend::execute( for (const auto& output_tensor : output_tensors) { // pos=0 limits the search to the prefix if (output_tensor->GetName().rfind("output_", 0) == 0) { - output_tensor->FillDataBuffer( - args[output_index]->toTensor().mutable_data_ptr(), - false /* copy_data */); + void* mutable_data_ptr = + args[output_index]->toTensor().mutable_data_ptr(); + if (qnn_manager->RegisterMem(mutable_data_ptr, output_tensor) != + Error::Ok) { + output_tensor->FillDataBuffer(mutable_data_ptr, false /* copy_data */); + } output_index++; } output_tensor_structs.push_back(output_tensor->CloneTensorStruct()); diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 3303a08309d..dc3217fc1c8 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -6,9 +6,9 @@ * LICENSE file in the root directory of this source tree. */ #include +#include #include #include - #include #include #include @@ -54,7 +54,9 @@ QnnManager::QnnManager( "the size of qnn context binary: %d", qnn_executorch_context_binary.nbytes); QNN_EXECUTORCH_LOG_INFO( - "Is on-device graph construction: %d", options_->online_prepare()); + "Is on-device graph construction: %d", options->online_prepare()); + QNN_EXECUTORCH_LOG_INFO( + "Enable shared buffer: %d", options->shared_buffer()); } if (library_path.empty()) { @@ -82,6 +84,53 @@ Error QnnManager::LoadQnnLibrary() { return ret; } +Error QnnManager::RegisterMem( + void* data_ptr, + const std::shared_ptr& tensor_wrapper) { + SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager(); + // Not enable shared buffer + if (!options_->shared_buffer()) + return Error::Internal; + + if (backend_params_ptr_->qnn_mem_manager_ptr_ == nullptr) { + QNN_EXECUTORCH_LOG_WARN( + "Backend %s doesn't supported shared buffer.", + EnumNameQnnExecuTorchBackendType( + options_->backend_options()->backend_type())); + return Error::Internal; + } + + if (!shared_buffer_manager.IsAllocated(data_ptr)) { + // It means two scenarios here: + // 1. the input and output partitioned graph + // 2. 
Actually, user doesn't allocate shared buffer with + // QnnExecuTorchAllocCustomMem API + return Error::Internal; + } else if (backend_params_ptr_->qnn_mem_manager_ptr_->IsRegistered( + tensor_wrapper->GetMemHandle())) { + if (options_->log_level() >= QnnExecuTorchLogLevel::kLogLevelInfo) + QNN_EXECUTORCH_LOG_INFO( + "Tensor name %s has been registered shared memory.", + tensor_wrapper->GetName().c_str()); + return Error::Ok; + } + + int32_t mem_fd = SharedBuffer::GetSharedBufferManager().MemToFd(data_ptr); + if (mem_fd == -1) { + QNN_EXECUTORCH_LOG_WARN( + "Tensor name %s is failed to get file descriptor.", + tensor_wrapper->GetName().c_str()); + return Error::Internal; + } + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_->qnn_mem_manager_ptr_->RegisterMem( + tensor_wrapper, mem_fd) == Error::Ok, + Internal, + "Fail to register to shared memory."); + + return Error::Ok; +} + Error QnnManager::Init() { ET_CHECK_OR_RETURN_ERROR( LoadQnnLibrary() == Error::Ok, Internal, "Fail to load Qnn library"); @@ -219,14 +268,6 @@ void QnnManager::Destroy() { qnn_loaded_backend_.TerminateAllBackends(); } -bool QnnManager::IsAvailable() { - return true; -} - -bool QnnManager::IsOnlinePrepare() { - return options_->online_prepare(); -} - bool QnnManager::IsNodeSupportedByBackend( std::vector>& op_wrappers) { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -329,3 +370,14 @@ Error QnnManager::Compile( } // namespace qnn } // namespace executor } // namespace torch +void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment) { + using torch::executor::qnn::SharedBuffer; + void* buffer_ptr = + SharedBuffer::GetSharedBufferManager().AllocMem(bytes, alignment); + return buffer_ptr; +} + +void QnnExecuTorchFreeCustomMem(void* buffer_ptr) { + using torch::executor::qnn::SharedBuffer; + SharedBuffer::GetSharedBufferManager().FreeMem(buffer_ptr); +} diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index a0a5b35e14d..639d3534de4 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -42,14 +42,18 @@ class QnnManager { void Destroy(); - bool IsAvailable(); + bool IsAvailable() { + return true; + } + + bool IsOnlinePrepare() { + return options_->online_prepare(); + } bool IsTensorDump() { return options_->tensor_dump_output_path()->size() > 0; } - bool IsOnlinePrepare(); - bool IsNodeSupportedByBackend( std::vector>& op_wrappers); @@ -57,6 +61,10 @@ class QnnManager { std::vector>& op_wrappers, QnnExecuTorchContextBinary& qnn_executorch_context_binary); + Error RegisterMem( + void* data_ptr, + const std::shared_ptr& tensor_wrapper); + std::vector> GetGraphInputs() { return input_tensors_; } diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp new file mode 100644 index 00000000000..423c5d63723 --- /dev/null +++ b/backends/qualcomm/runtime/SharedBuffer.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include + +// Refer to the QNN HTP Shared Buffer Tutorial +// in Qualcomm® AI Engine Direct document +constexpr uint8_t RPCMEM_HEAP_ID_SYSTEM = 25; +constexpr uint8_t RPCMEM_DEFAULT_FLAGS = 1; + +namespace torch { +namespace executor { +namespace qnn { + +namespace { + +intptr_t alignTo(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? 
offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + +} // namespace + +std::mutex SharedBuffer::init_mutex_; + +SharedBuffer& SharedBuffer::GetSharedBufferManager() { + std::lock_guard lk(init_mutex_); + static SharedBuffer shared_buffer_manager; + if (!shared_buffer_manager.GetInitialize()) { + Error status = shared_buffer_manager.Load(); + if (status == Error::Ok) { + shared_buffer_manager.SetInitialize(true); + } + } + return shared_buffer_manager; +} + +SharedBuffer::~SharedBuffer() { + if (initialize_) { + SharedBuffer::GetSharedBufferManager().UnLoad(); + } +}; + +void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) { + if (!initialize_) { + QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized."); + return nullptr; + } + // do alignment: + auto allocate_bytes = static_cast(bytes + alignment); + void* buf = rpc_mem_alloc_( + RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + QNN_EXECUTORCH_LOG_WARN("Failed to allocate the tensor by RPC memory."); + return nullptr; + } + auto aligned_buf = reinterpret_cast( + alignTo(alignment, reinterpret_cast(buf))); + bool status = + restore_map_.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_EXECUTORCH_LOG_ERROR("Failed to allocate the tensor by RPC memory."); + rpc_mem_free_(buf); + } + return aligned_buf; +} + +int32_t SharedBuffer::MemToFd(void* buf) { + int32_t memFd = -1; + if (!initialize_) { + QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized."); + } else { + memFd = rpc_mem_to_fd_(buf); + } + return memFd; +} + +void SharedBuffer::FreeMem(void* buf) { + if (!initialize_) { + QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized."); + } else if (restore_map_.count(buf) == 0) { + QNN_EXECUTORCH_LOG_WARN("Don't free an unallocated tensor."); + } else { + rpc_mem_free_(restore_map_[buf]); + restore_map_.erase(buf); + } +} + +bool SharedBuffer::IsAllocated(void* buf) { + return restore_map_.count(buf) != 0U; +} + +Error SharedBuffer::Load() { + // On Android, 32-bit and 64-bit libcdsprpc.so can be found at /vendor/lib/ + // and /vendor/lib64/ respectively. + lib_cdsp_rpc_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (lib_cdsp_rpc_ == nullptr) { + QNN_EXECUTORCH_LOG_ERROR( + "Unable to load shared buffer. dlerror(): %s", dlerror()); + return Error::Internal; + } + rpc_mem_alloc_ = reinterpret_cast( // NOLINT + dlsym(lib_cdsp_rpc_, "rpcmem_alloc")); + rpc_mem_free_ = reinterpret_cast( // NOLINT + dlsym(lib_cdsp_rpc_, "rpcmem_free")); + rpc_mem_to_fd_ = reinterpret_cast( // NOLINT + dlsym(lib_cdsp_rpc_, "rpcmem_to_fd")); + if (nullptr == rpc_mem_alloc_ || nullptr == rpc_mem_free_ || + nullptr == rpc_mem_to_fd_) { + QNN_EXECUTORCH_LOG_ERROR( + "Unable to access symbols in shared buffer. dlerror(): %s", dlerror()); + dlclose(lib_cdsp_rpc_); + return Error::Internal; + } + return Error::Ok; +} + +Error SharedBuffer::UnLoad() { + if (dlclose(lib_cdsp_rpc_) != 0) { + QNN_EXECUTORCH_LOG_ERROR( + "Unable to close shared buffer. dlerror(): %s", dlerror()); + return Error::Internal; + }; + return Error::Ok; +} +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/SharedBuffer.h b/backends/qualcomm/runtime/SharedBuffer.h new file mode 100644 index 00000000000..1803e8af879 --- /dev/null +++ b/backends/qualcomm/runtime/SharedBuffer.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once +#include +#include +#include +#include +#include +#include + +using RpcMemAllocFn_t = void* (*)(int, uint32_t, int); +using RpcMemFreeFn_t = void (*)(void*); +using RpcMemToFdFn_t = int (*)(void*); + +namespace torch { +namespace executor { +namespace qnn { +class SharedBuffer final { + public: + SharedBuffer(const SharedBuffer&) = delete; + SharedBuffer& operator=(const SharedBuffer&) = delete; + SharedBuffer(SharedBuffer&&) = delete; + SharedBuffer& operator=(SharedBuffer&&) = delete; + ~SharedBuffer(); + + static SharedBuffer& GetSharedBufferManager(); + void* AllocMem(size_t bytes, size_t alignment); + // map a buffer allocated via RPCMem to a file descriptor so it can be + // registered with a backend via QnnMem_register() + int32_t MemToFd(void* buf); + + void FreeMem(void* buf); + + bool IsAllocated(void* buf); + + bool GetInitialize() { + return initialize_; + } + void SetInitialize(bool initialize) { + initialize_ = initialize; + } + + private: + SharedBuffer() = default; + + // dlopen RPCMem library and dlysm required functions + Error Load(); + + Error UnLoad(); + + // Pointer to the dlopen'd libcdsprpc.so shared library which contains + // rpcmem_alloc, rpcmem_free, rpcmem_to_fd APIs + void* lib_cdsp_rpc_; + // Function pointer to rpcmem_alloc + RpcMemAllocFn_t rpc_mem_alloc_; + // Function pointer to rpcmem_free + RpcMemFreeFn_t rpc_mem_free_; + // Function pointer to rpcmem_to_fd + RpcMemToFdFn_t rpc_mem_to_fd_; + std::unordered_map restore_map_; + std::atomic_bool initialize_{false}; + static std::mutex init_mutex_; +}; + +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/backends/CMakeLists.txt b/backends/qualcomm/runtime/backends/CMakeLists.txt index 65871d22e14..6541989be15 100644 --- a/backends/qualcomm/runtime/backends/CMakeLists.txt +++ b/backends/qualcomm/runtime/backends/CMakeLists.txt @@ -109,6 +109,14 @@ target_sources(qnn_backend ${CMAKE_CURRENT_LIST_DIR}/QnnBackendCommon.cpp ) +# qnn_mem_manager +target_sources(qnn_mem_manager + PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/QnnMemManager.h + PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/QnnMemManager.cpp +) + # qnn_factory target_sources(qnn_factory PUBLIC diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index d90f850386a..acb95524682 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -69,6 +69,8 @@ std::unique_ptr QnnBackendFactory::Create( options->graph_name()->str(), options->soc_info(), htp_options); + backend_params->qnn_mem_manager_ptr_ = std::make_unique( + implementation, backend_params->qnn_context_ptr_.get()); backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED; return backend_params; } break; diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.h b/backends/qualcomm/runtime/backends/QnnBackendFactory.h index bfed40d9aaa..ab47113a538 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.h +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,7 @@ typedef struct BackendConfigParameters { std::unique_ptr qnn_context_ptr_; std::unique_ptr qnn_device_ptr_; std::unique_ptr qnn_graph_ptr_; + std::unique_ptr 
qnn_mem_manager_ptr_; // Default ctor BackendConfigParameters() @@ -40,10 +42,12 @@ typedef struct BackendConfigParameters { backend_init_state_(BackendInitializeState::UNINITIALIZED), qnn_context_ptr_(nullptr), qnn_device_ptr_(nullptr), - qnn_graph_ptr_(nullptr) {} + qnn_graph_ptr_(nullptr), + qnn_mem_manager_ptr_(nullptr) {} // Default dtor ~BackendConfigParameters() { qnn_graph_ptr_.reset(); + qnn_mem_manager_ptr_.reset(); qnn_context_ptr_.reset(); qnn_device_ptr_.reset(); qnn_backend_ptr_.reset(); diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.cpp b/backends/qualcomm/runtime/backends/QnnMemManager.cpp new file mode 100644 index 00000000000..8f8317e0136 --- /dev/null +++ b/backends/qualcomm/runtime/backends/QnnMemManager.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include + +namespace torch { +namespace executor { +namespace qnn { + +bool QnnMemManager::IsRegistered(Qnn_MemHandle_t handle) { + return registered_set_.count(handle) != 0U; +} + +Error QnnMemManager::RegisterMem( + const std::shared_ptr& tensor_wrapper, + int32_t mem_fd) { + const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); + Qnn_MemDescriptor_t descriptor = { + {tensor_wrapper->GetRank(), tensor_wrapper->GetDims(), nullptr}, + tensor_wrapper->GetDataType(), + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + error = qnn_interface.qnn_mem_register( + context_->GetHandle(), + &descriptor, + /*numDescriptors=*/1, + &handle); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_WARN( + "Tensor %s is failed to register shared memory. Error %d", + tensor_wrapper->GetName().c_str(), + QNN_GET_ERROR_CODE(error)); + return Error::Internal; + } + tensor_wrapper->SetMemHandle(handle); + registered_set_.insert(handle); + QNN_EXECUTORCH_LOG_INFO( + "Tensor %s is successfully registered to shared memory.", + tensor_wrapper->GetName().c_str()); + return Error::Ok; +} + +void QnnMemManager::DeRegisterMem() { + const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + for (auto& mem_handle : registered_set_) { + error = qnn_interface.qnn_mem_de_register(&mem_handle, /*numHandles=*/1); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_WARN( + "Failed to de-register shared memory. Error %d", + QNN_GET_ERROR_CODE(error)); + } + } + registered_set_.clear(); +} + +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.h b/backends/qualcomm/runtime/backends/QnnMemManager.h new file mode 100644 index 00000000000..9d5949db16a --- /dev/null +++ b/backends/qualcomm/runtime/backends/QnnMemManager.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#pragma once +#include +#include +#include +#include + +namespace torch { +namespace executor { +namespace qnn { + +class QnnMemManager { + public: + explicit QnnMemManager( + const QnnImplementation& implementation, + QnnContext* context) + : implementation_(implementation), context_(context) {} + ~QnnMemManager() { + DeRegisterMem(); + } + + Error RegisterMem( + const std::shared_ptr& tensor_wrapper, + int32_t mem_fd); + + bool IsRegistered(Qnn_MemHandle_t handle); + + private: + void DeRegisterMem(); + + const QnnImplementation& implementation_; + QnnContext* context_; + std::unordered_set registered_set_; +}; +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/serialization/qnn_compile_spec_schema.py b/backends/qualcomm/serialization/qnn_compile_spec_schema.py index b3b70328ae0..0f926fc0975 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_schema.py +++ b/backends/qualcomm/serialization/qnn_compile_spec_schema.py @@ -131,3 +131,4 @@ class QnnExecuTorchOptions: online_prepare: bool = False tensor_dump_output_path: str = "" profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff + shared_buffer: bool = False diff --git a/backends/qualcomm/serialization/schema.fbs b/backends/qualcomm/serialization/schema.fbs index c19bf681bbf..8c4d23172f0 100644 --- a/backends/qualcomm/serialization/schema.fbs +++ b/backends/qualcomm/serialization/schema.fbs @@ -172,6 +172,9 @@ table QnnExecuTorchOptions { /// Profiling level of the delegate and the backend. Default is off. profile_level:QnnExecuTorchProfileLevel; + + /// Enables usage of shared buffer between application and backend for graph I/O. + shared_buffer:bool; } root_type QnnExecuTorchOptions; diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index e36c6e5ecd8..66a3ad5c613 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -56,6 +56,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_arange(self): @@ -389,6 +390,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_conv1d_relu_log_softmax(self): @@ -484,6 +486,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_16a4w_conv2d(self): @@ -880,6 +883,7 @@ def setUp(self): online_prepare=TestQNN.online_prepare, tensor_dump_output_path="", profile=TestQNN.enable_profile, + shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_conv1d_relu_log_softmax(self): @@ -1077,6 +1081,24 @@ def test_qnn_backend_profile_op(self): expected_profile_events=25, ) + def test_qnn_backend_shared_buffer(self): + TestQNN.shared_buffer = True + backend_options = generate_htp_compiler_spec( + use_fp16=True, + ) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + shared_buffer=True, + ) + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + ) + class TestQNNQuantizedUtils(TestQNN): # TODO: refactor to 
support different backends @@ -1179,6 +1201,25 @@ def test_qnn_backend_profile_op(self): expected_profile_events=26, ) + def test_qnn_backend_shared_buffer(self): + TestQNN.shared_buffer = True + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + shared_buffer=True, + ) + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + ) + class TestExampleScript(TestQNN): def required_envs(self, conditions=None) -> bool: @@ -1215,6 +1256,8 @@ def test_mobilenet_v2(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1248,6 +1291,8 @@ def test_inception_v3(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1281,6 +1326,8 @@ def test_inception_v4(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1314,6 +1361,8 @@ def test_vit(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1346,6 +1395,8 @@ def test_edsr(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1378,6 +1429,8 @@ def test_deeplab_v3(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1411,6 +1464,8 @@ def test_dummy_llama2(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1442,6 +1497,8 @@ def test_ptq_dummy_llama2(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1475,6 +1532,8 @@ def test_mobilebert(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1515,6 +1574,8 @@ def test_ptq_mobilebert(self): ] if self.host: cmds.extend(["--host", self.host]) + if self.shared_buffer: + cmds.extend(["--shared_buffer"]) p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -1585,6 +1646,7 @@ def setup_environment(): TestQNN.online_prepare = args.online_prepare TestQNN.enable_profile = args.enable_profile 
TestQNN.error_only = args.error_only + TestQNN.shared_buffer = args.shared_buffer return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index dc0da7f75dc..ee7d6a7a3b6 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -32,6 +32,7 @@ from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from executorch.exir.program._program import ExecutorchProgram from executorch.sdk import generate_etrecord from executorch.sdk.inspector import Inspector @@ -64,6 +65,7 @@ class TestQNN(unittest.TestCase): use_8a8w: str = "8a8w" use_16a16w: str = "16a16w" use_16a4w: str = "16a4w" + shared_buffer: bool = False def _assert_outputs_equal(self, model_output, ref_output): self.assertTrue(len(ref_output) == len(model_output)) @@ -183,7 +185,19 @@ def lower_module_and_test_output( delegated_program.exported_program = to_backend( delegated_program.exported_program, qnn_partitioner ) - exec_prog = delegated_program.to_executorch() + exec_prog = delegated_program.to_executorch( + exir.ExecutorchBackendConfig( + # For shared buffer, user must pass the memory address + # which is allocated by RPC memory to executor runner. + # Therefore, won't want to pre-allocate + # by memory manager in runtime. + memory_planning_pass=MemoryPlanningPass( + memory_planning_algo="greedy", + alloc_graph_input=not self.shared_buffer, + alloc_graph_output=not self.shared_buffer, + ) + ) + ) # Assert the backend name is qnn self.assertEqual( diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 1af9572bd3b..7fa696efbac 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -190,6 +190,7 @@ def generate_qnn_executorch_compiler_spec( online_prepare: bool = False, tensor_dump_output_path: str = "", profile: bool = False, + shared_buffer: bool = False, ) -> List[CompileSpec]: """ Helper function generating compiler specs for Qualcomm AI Engine Direct @@ -215,6 +216,8 @@ def generate_qnn_executorch_compiler_spec( profile: Enable profile the performance of per operator. Note that for now only support kProfileDetailed to profile the performance of each operator with cycle unit. + shared_buffer: Enables usage of shared buffer between application + and backend for graph I/O. Returns: List[CompileSpec]: Compiler specs for Qualcomm AI Engine Direct. 
@@ -250,6 +253,9 @@ def generate_qnn_executorch_compiler_spec( else: qnn_executorch_options.profile_level = QnnExecuTorchProfileLevel.kProfileOff + if shared_buffer: + qnn_executorch_options.shared_buffer = True + if ( online_prepare and backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index 905deca6445..54772f5c781 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -100,7 +100,7 @@ target_link_libraries(qnn_executor_runner qnn_executorch_backend full_portable_ops_lib etdump - ${FLATCC_LIB} + ${FLATCCRT_LIB} gflags ) target_compile_options(qnn_executor_runner diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 0b13122e961..bd18cdc16b1 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -17,7 +17,10 @@ * Currently we assume that the outputs are all fp32 tensors. */ +#include #include +#include +#include #include #include #include @@ -25,6 +28,7 @@ #include #include #include + #include #include @@ -47,14 +51,55 @@ DEFINE_string( DEFINE_string(input_list_path, "input_list.txt", "Model input list path."); DEFINE_int32(iteration, 1, "Iterations of inference."); DEFINE_int32(warm_up, 0, "Pre-run before inference."); +DEFINE_bool( + shared_buffer, + false, + "Specifies to use shared buffers for zero-copy usecase between the application and device/co-processor associated with the backend."); DEFINE_string( etdump_path, "etdump.etdp", "If etdump generation is enabled an etdump will be written out to this path"); using namespace torch::executor; +using torch::executor::MemoryAllocator; using torch::executor::util::FileDataLoader; +class CustomMemory { + public: + CustomMemory(bool shared_buffer) : shared_buffer_(shared_buffer){}; + bool Allocate(size_t bytes, size_t alignment) { + if (shared_buffer_) { + ptr_ = QnnExecuTorchAllocCustomMem(bytes, alignment); + } else { + input_data_.resize(bytes); + ptr_ = input_data_.data(); + } + return ptr_ != nullptr; + } + + ~CustomMemory() { + if (shared_buffer_) { + if (ptr_ != nullptr) { + QnnExecuTorchFreeCustomMem(ptr_); + } + } + } + + void* GetPtr() { + return ptr_; + } + + CustomMemory(const CustomMemory&) = delete; + CustomMemory(CustomMemory&&) = delete; + CustomMemory& operator=(const CustomMemory&) = delete; + CustomMemory& operator=(CustomMemory&&) = delete; + + private: + bool shared_buffer_{false}; + void* ptr_{nullptr}; + std::vector input_data_; +}; + int main(int argc, char** argv) { runtime_init(); @@ -167,10 +212,58 @@ int main(int argc, char** argv) { ET_LOG(Info, "Method loaded."); // Prepare the inputs. - // Use ones-initialized inputs. 
- auto inputs = util::PrepareInputTensors(*method); + // Allocate data memory for inputs and outputs + std::vector> in_custom_mem; + std::vector> out_custom_mem; + in_custom_mem.reserve(method->inputs_size()); + out_custom_mem.reserve(method->outputs_size()); + + for (int input_index = 0; input_index < method->inputs_size(); + ++input_index) { + MethodMeta method_meta = method->method_meta(); + Result tensor_meta = method_meta.input_tensor_meta(input_index); + in_custom_mem.push_back( + std::make_unique(FLAGS_shared_buffer)); + std::unique_ptr& custom_mem_ptr = in_custom_mem.back(); + ET_CHECK_MSG( + custom_mem_ptr->Allocate( + tensor_meta->nbytes(), MemoryAllocator::kDefaultAlignment), + "Failed to allocate custom memory. tensor index: %d, bytes: %zu", + input_index, + tensor_meta->nbytes()); + TensorImpl impl = TensorImpl( + tensor_meta->scalar_type(), + /*dim=*/tensor_meta->sizes().size(), + const_cast(tensor_meta->sizes().data()), + custom_mem_ptr->GetPtr(), + const_cast(tensor_meta->dim_order().data())); + Error ret = method->set_input(Tensor(&impl), input_index); + ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", ret); + } + for (int output_index = 0; output_index < method->outputs_size(); + ++output_index) { + const exec_aten::Tensor& t = method->get_output(output_index).toTensor(); + out_custom_mem.push_back( + std::make_unique(FLAGS_shared_buffer)); + std::unique_ptr& custom_mem_ptr = out_custom_mem.back(); + ET_CHECK_MSG( + custom_mem_ptr->Allocate( + t.nbytes(), MemoryAllocator::kDefaultAlignment), + "Failed to allocate custom memory. tensor index: %d, bytes: %zu", + output_index, + t.nbytes()); + Error ret = method->set_output_data_ptr( + custom_mem_ptr->GetPtr(), t.nbytes(), output_index); + if (ret != Error::Ok) { + // This can error if the outputs are already pre-allocated. Ignore + // this error because it doesn't affect correctness, but log it. + ET_LOG( + Error, "ignoring error from set_output_data_ptr(): 0x%" PRIx32, ret); + } + } ET_LOG(Info, "Inputs prepared."); + // Fill in data for input std::ifstream input_list(FLAGS_input_list_path); if (input_list.is_open()) { size_t num_inputs = method->inputs_size(); @@ -205,31 +298,38 @@ int main(int argc, char** argv) { input_files.size()); for (int input_index = 0; input_index < num_inputs; ++input_index) { - exec_aten::Tensor& t = method->mutable_input(input_index).toTensor(); - std::vector input_data(t.nbytes()); + MethodMeta method_meta = method->method_meta(); + Result tensor_meta = + method_meta.input_tensor_meta(input_index); + std::ifstream fin(input_files[input_index], std::ios::binary); fin.seekg(0, fin.end); size_t file_size = fin.tellg(); ET_CHECK_MSG( - file_size == t.nbytes(), + file_size == tensor_meta->nbytes(), "Input(%d) size mismatch. file bytes: %zu, tensor bytes: %zu", input_index, file_size, - t.nbytes()); + tensor_meta->nbytes()); fin.seekg(0, fin.beg); - fin.read(input_data.data(), file_size); + fin.read( + static_cast(in_custom_mem[input_index]->GetPtr()), + file_size); fin.close(); - std::vector sizes(t.dim()); - for (int i = 0; i < sizes.size(); ++i) { - sizes[i] = t.sizes().data()[i]; - } - - auto t_impl = TensorImpl( - t.scalar_type(), t.dim(), sizes.data(), input_data.data()); - Error ret = method->set_input(EValue(Tensor(&t_impl)), input_index); + // For pre-allocated use case, we need to call set_input + // to copy data for the input tensors since they doesn't + // share the data with in_custom_mem. 
+ TensorImpl impl = TensorImpl( + tensor_meta->scalar_type(), + /*dim=*/tensor_meta->sizes().size(), + const_cast(tensor_meta->sizes().data()), + in_custom_mem[input_index]->GetPtr(), + const_cast( + tensor_meta->dim_order().data())); + Error ret = method->set_input(Tensor(&impl), input_index); ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", ret); } @@ -313,21 +413,5 @@ int main(int argc, char** argv) { ET_LOG(Info, "Model executed successfully."); } - // Dump the etdump data containing profiling/debugging data to the specified - // file. - etdump_result result = etdump_gen.get_etdump_data(); - if (result.buf != nullptr && result.size > 0) { - ET_LOG( - Info, - "Write etdump to %s, Size = %zu", - FLAGS_etdump_path.c_str(), - result.size); - FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); - fwrite((uint8_t*)result.buf, 1, result.size, f); - fclose(f); - free(result.buf); - } - - util::FreeInputs(inputs); return 0; } diff --git a/examples/qualcomm/scripts/deeplab_v3.py b/examples/qualcomm/scripts/deeplab_v3.py index 133e64d8568..4e08ab078c2 100755 --- a/examples/qualcomm/scripts/deeplab_v3.py +++ b/examples/qualcomm/scripts/deeplab_v3.py @@ -109,6 +109,7 @@ def get_dataset(data_size, dataset_dir, download): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -128,6 +129,7 @@ def get_dataset(data_size, dataset_dir, download): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/dummy_llama2.py b/examples/qualcomm/scripts/dummy_llama2.py index dd37f816004..8178ae5a5a4 100755 --- a/examples/qualcomm/scripts/dummy_llama2.py +++ b/examples/qualcomm/scripts/dummy_llama2.py @@ -128,6 +128,7 @@ def create_device_inputs(example_inputs, use_kv_cache): inputs, custom_annotations=(), quant_dtype=quant_dtype, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -141,6 +142,7 @@ def create_device_inputs(example_inputs, use_kv_cache): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py index f844b094c03..50639d41894 100755 --- a/examples/qualcomm/scripts/edsr.py +++ b/examples/qualcomm/scripts/edsr.py @@ -156,6 +156,7 @@ def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -175,6 +176,7 @@ def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py index e93e13ac33f..cdb84f6e8c6 100644 --- a/examples/qualcomm/scripts/export_example.py +++ b/examples/qualcomm/scripts/export_example.py @@ -12,6 +12,7 @@ ) from executorch.backends.qualcomm.utils.utils import ( capture_program, + generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, ) from executorch.examples.models import MODEL_NAME_TO_MODEL @@ -71,12 +72,13 @@ edge_copy = 
copy.deepcopy(edge_program) # Delegate to QNN backend + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) qnn_partitioner = QnnPartitioner( generate_qnn_executorch_compiler_spec( - is_fp16=False, soc_model=QcomChipset.SM8550, - debug=False, - saver=False, + backend_options=backend_options, ) ) with validation_disabled(): diff --git a/examples/qualcomm/scripts/inception_v3.py b/examples/qualcomm/scripts/inception_v3.py index 244e38edbe5..a3b5c41923d 100755 --- a/examples/qualcomm/scripts/inception_v3.py +++ b/examples/qualcomm/scripts/inception_v3.py @@ -111,6 +111,7 @@ def get_data_loader(): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -130,6 +131,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/inception_v4.py b/examples/qualcomm/scripts/inception_v4.py index db3feda2708..06b8047a18c 100755 --- a/examples/qualcomm/scripts/inception_v4.py +++ b/examples/qualcomm/scripts/inception_v4.py @@ -110,6 +110,7 @@ def get_data_loader(): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -129,6 +130,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index dc148afa8eb..84d130d4244 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -294,6 +294,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=quant_dtype, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -313,6 +314,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/mobilenet_v2.py b/examples/qualcomm/scripts/mobilenet_v2.py index 5f214a6f8ca..e389c00b3ec 100755 --- a/examples/qualcomm/scripts/mobilenet_v2.py +++ b/examples/qualcomm/scripts/mobilenet_v2.py @@ -111,6 +111,7 @@ def get_data_loader(): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: @@ -130,6 +131,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/torchvision_vit.py b/examples/qualcomm/scripts/torchvision_vit.py index ff22f93c4f4..63e1480b625 100755 --- a/examples/qualcomm/scripts/torchvision_vit.py +++ b/examples/qualcomm/scripts/torchvision_vit.py @@ -150,6 +150,7 @@ def get_data_loader(): f"{args.artifact}/{pte_filename}", inputs, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) # setup required paths accordingly # qnn_sdk : QNN SDK path setup in environment variable @@ -165,6 
+166,7 @@ def get_data_loader(): device_id=args.device, host_id=args.host, soc_model=args.model, + shared_buffer=args.shared_buffer, ) adb.push(inputs=inputs, input_list=input_list) adb.execute() diff --git a/examples/qualcomm/scripts/utils.py b/examples/qualcomm/scripts/utils.py index c815867f2d6..4f8e5b419c6 100755 --- a/examples/qualcomm/scripts/utils.py +++ b/examples/qualcomm/scripts/utils.py @@ -32,6 +32,7 @@ ) from executorch.exir.backend.backend_api import to_backend from executorch.exir.capture._config import ExecutorchBackendConfig +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -46,6 +47,7 @@ def __init__( soc_model, host_id=None, error_only=False, + shared_buffer=False, ): self.qnn_sdk = qnn_sdk self.artifact_path = artifact_path @@ -65,6 +67,7 @@ def __init__( } self.soc_model = arch_table[soc_model] self.error_only = error_only + self.shared_buffer = shared_buffer def _adb(self, cmd): if not self.host_id: @@ -123,6 +126,7 @@ def execute(self): f"--output_folder_path {self.output_folder}", f"--input_list_path {self.input_list_filename}", f"--etdump_path {self.etdump_path}", + "--shared_buffer" if self.shared_buffer else "", ] ) qnn_executor_runner_cmds = " ".join( @@ -157,6 +161,7 @@ def build_executorch_binary( skip_node_id_set=None, skip_node_op_set=None, quant_dtype: Optional[QuantDtype] = None, + shared_buffer=False, ): if quant_dtype: quantizer = QnnQuantizer() @@ -202,6 +207,7 @@ def build_executorch_binary( backend_options=backend_options, debug=False, saver=False, + shared_buffer=shared_buffer, ), skip_node_id_set, skip_node_op_set, @@ -209,7 +215,18 @@ def build_executorch_binary( edge_prog.exported_program = to_backend(edge_prog.exported_program, qnn_partitioner) edge_prog.exported_program.graph_module.graph.print_tabular() exec_prog = edge_prog.to_executorch( - config=ExecutorchBackendConfig(extract_constant_segment=False) + config=ExecutorchBackendConfig( + extract_constant_segment=False, + # For shared buffer, user must pass the memory address + # which is allocated by RPC memory to executor runner. + # Therefore, won't want to pre-allocate + # by memory manager in runtime. + memory_planning_pass=MemoryPlanningPass( + memory_planning_algo="greedy", + alloc_graph_input=not shared_buffer, + alloc_graph_output=not shared_buffer, + ), + ) ) with open(f"{file_name}.pte", "wb") as file: file.write(exec_prog.buffer) @@ -338,6 +355,13 @@ def setup_common_args_and_variables(): type=str, ) + parser.add_argument( + "-z", + "--shared_buffer", + help="Enables usage of shared buffer between application and backend for graph I/O.", + action="store_true", + ) + # QNN_SDK_ROOT might also be an argument, but it is used in various places. # So maybe it's fine to just use the environment. if "QNN_SDK_ROOT" not in os.environ:
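
[Editor's note] As a closing reference, the pieces above fit together roughly as sketched below for the ahead-of-time flow. This mirrors the changes to backends/qualcomm/tests/utils.py and examples/qualcomm/scripts/utils.py; `delegated_program` is assumed to come from capture_program() followed by to_backend() with a QnnPartitioner built from compiler specs that set shared_buffer=True, and the output path is arbitrary.

    from executorch import exir
    from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass

    shared_buffer = True  # same value passed to generate_qnn_executorch_compiler_spec()

    # When the application supplies RPC-allocated buffers at execution time,
    # graph inputs/outputs must not be pre-allocated by the runtime memory
    # manager, so their allocation is disabled in the memory planning pass.
    exec_prog = delegated_program.to_executorch(
        exir.ExecutorchBackendConfig(
            memory_planning_pass=MemoryPlanningPass(
                memory_planning_algo="greedy",
                alloc_graph_input=not shared_buffer,
                alloc_graph_output=not shared_buffer,
            )
        )
    )

    with open("model.pte", "wb") as f:
        f.write(exec_prog.buffer)

On device, qnn_executor_runner is then launched with --shared_buffer so that graph inputs and outputs are backed by memory obtained from QnnExecuTorchAllocCustomMem() and registered with the backend through QnnMemManager.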