From e80b2a24bed4030fbda54de8995131d3fc317946 Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Tue, 5 Nov 2024 16:33:09 +0800 Subject: [PATCH 1/3] Qualcomm AI Engine Direct - enable multiple graphs in single pte Summary: - support multiple graphs in single qnn context in runtime - helper function in aot for generating multi-method pte - enable weight sharing mechanism on HTP - support file signature for cache reuse - changes that making sure everything works as usual - test cases --- backends/qualcomm/CMakeLists.txt | 5 +- .../qualcomm/_passes/remove_redundancy.py | 1 + backends/qualcomm/aot/ir/qcir.fbs | 7 +- backends/qualcomm/aot/ir/qcir_utils.cpp | 8 +- .../aot/python/PyQnnManagerAdaptor.cpp | 14 +- .../qualcomm/aot/python/PyQnnManagerAdaptor.h | 196 +++++++++++++-- .../qualcomm/partition/qnn_partitioner.py | 9 +- backends/qualcomm/partition/utils.py | 22 ++ backends/qualcomm/qnn_preprocess.py | 10 +- backends/qualcomm/runtime/Logging.h | 2 +- .../qualcomm/runtime/QnnExecuTorchBackend.cpp | 179 +++++-------- .../qualcomm/runtime/QnnExecuTorchBackend.h | 15 ++ backends/qualcomm/runtime/QnnManager.cpp | 235 +++++++++++++++--- backends/qualcomm/runtime/QnnManager.h | 47 +++- .../qualcomm/runtime/backends/CMakeLists.txt | 1 + .../runtime/backends/QnnBackendCache.cpp | 115 +++++---- .../runtime/backends/QnnBackendCache.h | 32 ++- .../runtime/backends/QnnBackendFactory.cpp | 4 +- .../runtime/backends/QnnBackendFactory.h | 2 +- .../runtime/backends/QnnContextCommon.cpp | 22 +- .../runtime/backends/QnnContextCommon.h | 18 +- .../runtime/backends/QnnDeviceCommon.h | 2 +- .../runtime/backends/QnnGraphCommon.cpp | 39 ++- .../runtime/backends/QnnGraphCommon.h | 41 +-- .../qualcomm/runtime/backends/QnnLogger.h | 2 +- .../backends/htpbackend/HtpBackendCache.h | 6 +- .../htpbackend/HtpContextCustomConfig.h | 2 +- .../htpbackend/HtpDeviceCustomConfig.h | 2 +- .../htpbackend/HtpDevicePlatformInfoConfig.h | 2 +- .../runtime/backends/htpbackend/HtpGraph.h | 3 +- .../htpbackend/HtpGraphCustomConfig.cpp | 7 +- .../htpbackend/HtpGraphCustomConfig.h | 5 +- .../aarch64/HtpGraphCustomConfig.cpp | 21 ++ .../x86_64/HtpContextCustomConfig.cpp | 13 +- .../x86_64/HtpGraphCustomConfig.cpp | 21 ++ .../qualcomm/serialization/qc_binary_info.fbs | 20 ++ .../{schema.fbs => qc_compiler_spec.fbs} | 17 +- ...nn_compile_spec_schema.py => qc_schema.py} | 8 + ...ec_serialize.py => qc_schema_serialize.py} | 42 ++-- backends/qualcomm/serialization/targets.bzl | 8 +- backends/qualcomm/targets.bzl | 4 +- backends/qualcomm/tests/models.py | 4 +- backends/qualcomm/tests/test_qnn_delegate.py | 121 +++++++-- backends/qualcomm/tests/utils.py | 44 ++-- backends/qualcomm/utils/utils.py | 198 +++++++++++---- .../executor_runner/qnn_executor_runner.cpp | 25 +- examples/qualcomm/oss_scripts/llama2/llama.py | 4 +- .../qualcomm/oss_scripts/llama3_2/llama.py | 4 +- .../llama/llama2/qaihub_llama2_7b.py | 40 +-- .../llama/llama3/qaihub_llama3_8b.py | 27 +- .../qaihub_scripts/llama/runner/io_memory.cpp | 13 +- .../qaihub_scripts/llama/runner/io_memory.h | 1 + .../qaihub_scripts/llama/runner/runner.cpp | 13 +- .../qaihub_scripts/llama/runner/runner.h | 1 + .../qaihub_stable_diffusion.py | 26 +- .../stable_diffusion/runner/runner.cpp | 30 ++- .../stable_diffusion/runner/runner.h | 1 + .../qualcomm/qaihub_scripts/utils/export.py | 40 ++- .../qualcomm/qaihub_scripts/utils/utils.py | 54 ++-- examples/qualcomm/scripts/export_example.py | 4 +- .../qualcomm/scripts/mobilebert_fine_tune.py | 4 +- examples/qualcomm/utils.py | 7 +- 62 files changed, 1279 
insertions(+), 591 deletions(-) create mode 100644 backends/qualcomm/partition/utils.py create mode 100644 backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpGraphCustomConfig.cpp create mode 100644 backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpGraphCustomConfig.cpp create mode 100644 backends/qualcomm/serialization/qc_binary_info.fbs rename backends/qualcomm/serialization/{schema.fbs => qc_compiler_spec.fbs} (91%) rename backends/qualcomm/serialization/{qnn_compile_spec_schema.py => qc_schema.py} (95%) rename backends/qualcomm/serialization/{qnn_compile_spec_serialize.py => qc_schema_serialize.py} (50%) diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index a73b4ba85da..1f92b2d8cfd 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -74,7 +74,10 @@ include_directories( ${EXECUTORCH_SOURCE_DIR}/third-party/flatbuffers/include ) -set(_qnn_schema__srcs backends/qualcomm/serialization/schema.fbs) +set(_qnn_schema__srcs + backends/qualcomm/serialization/qc_compiler_spec.fbs + backends/qualcomm/serialization/qc_binary_info.fbs +) set(_qnn_schema__include_dir "${CMAKE_BINARY_DIR}/schema/include") # Paths to headers generated from the .fbs files. set(_qnn_schema__outputs) diff --git a/backends/qualcomm/_passes/remove_redundancy.py b/backends/qualcomm/_passes/remove_redundancy.py index c54596f6583..825b2584ca7 100644 --- a/backends/qualcomm/_passes/remove_redundancy.py +++ b/backends/qualcomm/_passes/remove_redundancy.py @@ -20,6 +20,7 @@ class RemoveRedundancy(ExportPass): exir_ops.edge.aten.clone.default, torch.ops.aten.alias.default, exir_ops.edge.aten.alias.default, + exir_ops.edge.aten.lift_fresh_copy.default, } def __init__(self): diff --git a/backends/qualcomm/aot/ir/qcir.fbs b/backends/qualcomm/aot/ir/qcir.fbs index 2d8b1f78fec..6c16a54e0db 100755 --- a/backends/qualcomm/aot/ir/qcir.fbs +++ b/backends/qualcomm/aot/ir/qcir.fbs @@ -94,8 +94,13 @@ table Operator { } table Graph { + name: string; nodes: [Operator]; tensors: [Tensor]; } -root_type Graph; +table Context { + graphs: [Graph]; +} + +root_type Context; diff --git a/backends/qualcomm/aot/ir/qcir_utils.cpp b/backends/qualcomm/aot/ir/qcir_utils.cpp index 153604f8d9d..8cf024ba006 100755 --- a/backends/qualcomm/aot/ir/qcir_utils.cpp +++ b/backends/qualcomm/aot/ir/qcir_utils.cpp @@ -161,9 +161,7 @@ flatbuffers::Offset ToQuantizeParam( } } break; default: - QNN_EXECUTORCH_LOG_WARN( - "QNN_QUANTIZATION_ENCODING_UNDEFINED detected: %s", - QNN_VER_PTR(tensor)->name); + // encodings are not required if lowering with floating point precision break; } return CreateQuantizeParamDirect( @@ -229,9 +227,7 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) { const_cast(param->offsets()->data()); } break; default: - QNN_EXECUTORCH_LOG_WARN( - "qcir::QuantizeType::UNDEFINED detected: %s", - tensor->name()->c_str()); + // encodings are not required if lowering with floating point precision break; } return p; diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp index f3f7b618c9d..9dc7f7159cb 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp @@ -30,16 +30,26 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) { py::class_>(m, "QnnManager") .def(py::init()) .def(py::init()) + .def(py::init()) .def("Init", &PyQnnManager::Init) .def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend) - .def("Compile", 
&PyQnnManager::Compile) + .def("Compile", py::overload_cast<>(&PyQnnManager::Compile)) + .def( + "Compile", + py::overload_cast< + const std::string&, + std::vector>&>(&PyQnnManager::Compile)) .def("Destroy", &PyQnnManager::Destroy) .def("IsAvailable", &PyQnnManager::IsAvailable) .def("IsTensorDump", &PyQnnManager::IsTensorDump) .def("AllocateTensor", &PyQnnManager::AllocateTensor) .def("GetGraphInputs", &PyQnnManager::GetGraphInputs) .def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs) - .def("GetSpillFillBufferSize", &PyQnnManager::GetSpillFillBufferSize); + .def("GetGraphNames", &PyQnnManager::GetGraphNames) + .def("GetSpillFillBufferSize", &PyQnnManager::GetSpillFillBufferSize) + .def( + "MakeBinaryInfo", + py::overload_cast(&PyQnnManager::MakeBinaryInfo)); } } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h index 5cfae78c353..55429f2b430 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h @@ -8,10 +8,11 @@ #pragma once #include #include +#include +#include #include #include #include -#include #include #include #include @@ -35,6 +36,7 @@ class PyQnnManager { qnn_manager_ = std::make_shared( qnn_executorch_options, qnn_executorch_context_binary_); } + // used for loading context binary directly explicit PyQnnManager(const py::bytes& buffer, const py::bytes& ctx_bin) : qnn_executorch_option_ptr_(buffer) { @@ -42,25 +44,119 @@ class PyQnnManager { qnn_executorch_option_ptr_.cast().data()); py::buffer_info info(py::buffer(ctx_bin).request()); - qnn_executorch_context_binary_.buffer = static_cast(info.ptr); + qnn_executorch_context_binary_.buffer = info.ptr; qnn_executorch_context_binary_.nbytes = info.size * info.itemsize; qnn_manager_ = std::make_shared( qnn_executorch_options, qnn_executorch_context_binary_); } + // used for loading multiple graphs in qcir + explicit PyQnnManager(const py::bytes& buffer, const py::list& qcirs) + : qnn_executorch_option_ptr_(buffer) { + auto qnn_executorch_options = GetQnnExecuTorchOptions( + qnn_executorch_option_ptr_.cast().data()); + + // merge multiple qcirs into one context with multiple graphs + std::vector> graphs; + for (size_t i = 0; i < qcirs.size(); ++i) { + py::buffer_info info(py::buffer(qcirs[i].cast()).request()); + flatbuffers::Verifier verifier_binary_info( + static_cast(info.ptr), + info.size * info.itemsize); + if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { + QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); + return; + } + auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr); + + flatbuffers::Verifier verifier_qcir( + binary_info->data()->data(), binary_info->data()->size()); + if (!qcir::VerifyContextBuffer(verifier_qcir)) { + QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format"); + return; + } + auto context = qcir::GetContext(binary_info->data()->data()); + for (const auto& graph : *context->graphs()) { + std::vector> tensors; + for (const auto tensor : *graph->tensors()) { + // here we need to take a detour to merge multiple qcir flatbuffers + // outer ToTensor + // return: flatbuffers::Offset + // consume: QnnTensor, flatbuffers::FlatBufferBuilder* + // inner ToTensor + // return: QnnTensor + // consume: flatbuffers::Vector<::flatbuffers::Offset> + tensors.emplace_back(ToTensor(ToTensor(tensor), &builder_)); + } + std::vector> nodes; + for (const auto& node : *graph->nodes()) { + int32_t* inputs_ptr = 
const_cast(node->inputs()->data()); + int32_t* outputs_ptr = const_cast(node->outputs()->data()); + int32_t* params_ptr = const_cast(node->params()->data()); + std::vector inputs( + inputs_ptr, inputs_ptr + node->inputs()->size()); + std::vector outputs( + outputs_ptr, outputs_ptr + node->outputs()->size()); + std::vector params( + params_ptr, params_ptr + node->params()->size()); + nodes.emplace_back(qcir::CreateOperatorDirect( + builder_, + node->name()->str().c_str(), + node->package_name()->str().c_str(), + node->type_name()->str().c_str(), + &inputs, + &outputs, + ¶ms)); + } + graphs.emplace_back(qcir::CreateGraphDirect( + builder_, graph->name()->str().c_str(), &nodes, &tensors)); + } + } + + auto context = qcir::CreateContextDirect(builder_, &graphs); + builder_.Finish(context); + QnnExecuTorchContextBinary qcir_bin( + {builder_.GetBufferPointer(), builder_.GetSize()}); + + qnn_executorch_context_binary_ = MakeBinaryInfo(qcir_bin); + qnn_manager_ = std::make_shared( + qnn_executorch_options, qnn_executorch_context_binary_); + } + executorch::runtime::Error Init() { return qnn_manager_->Init(); } + bool IsNodeSupportedByBackend( std::vector>& op_wrappers) { return qnn_manager_->IsNodeSupportedByBackend(op_wrappers); } + + // this method is specific for compiling multi-graphs + py::array_t Compile() { + if (qnn_manager_->CompileQcir() != Error::Ok) { + QNN_EXECUTORCH_LOG_ERROR("Fail to compile qcir"); + return py::array_t(0); + } + + // generate context binary if compilation succeded + QnnExecuTorchContextBinary binary_info; + qnn_manager_->GetContextBinary(binary_info); + // allocate py::array (to pass the result of the C++ function to Python) + auto result = py::array_t(binary_info.nbytes); + auto result_buffer = result.request(); + char* result_ptr = (char*)result_buffer.ptr; + std::memcpy(result_ptr, binary_info.buffer, binary_info.nbytes); + return result; + } + py::array_t Compile( + const std::string& graph_name, std::vector>& op_wrappers) { - QnnExecuTorchContextBinary context_binary; - flatbuffers::FlatBufferBuilder builder; + QnnExecuTorchContextBinary binary_info; - if (qnn_manager_->IsOnlinePrepare()) { + if (qnn_manager_->IsOnlinePrepare() || qnn_manager_->IsMultipleGraphs()) { + builder_.Reset(); std::vector> tensors; std::unordered_map tensor_map; @@ -74,7 +170,7 @@ class PyQnnManager { tensor_map[wrapper.get()] = i; index.push_back(i); tensors.emplace_back( - ToTensor(wrapper->CloneTensorStruct(), &builder)); + ToTensor(wrapper->CloneTensorStruct(), &builder_)); } }; @@ -112,13 +208,13 @@ class PyQnnManager { QNN_VER_PTR(t)->clientBuf.dataSize = GetDataTypeSize(QNN_VER_PTR(t)->dataType); params.push_back(tensors.size()); - tensors.emplace_back(ToTensor(t, &builder)); + tensors.emplace_back(ToTensor(t, &builder_)); } } Qnn_OpConfig_t op_config = op_wrapper->GetOpConfig(); operators.emplace_back(qcir::CreateOperatorDirect( - builder, + builder_, QNN_VER_PTR(op_config)->name, QNN_VER_PTR(op_config)->packageName, QNN_VER_PTR(op_config)->typeName, @@ -126,24 +222,34 @@ class PyQnnManager { &outputs, ¶ms)); } - auto graph = qcir::CreateGraphDirect(builder, &operators, &tensors); - builder.Finish(graph); - context_binary.buffer = builder.GetBufferPointer(); - context_binary.nbytes = builder.GetSize(); - } else if ( - qnn_manager_->Compile(op_wrappers, context_binary) != - executorch::runtime::Error::Ok) { - return py::array_t(0); + auto graph = qcir::CreateGraphDirect( + builder_, graph_name.c_str(), &operators, &tensors); + std::vector> graphs({graph}); + auto context = 
qcir::CreateContextDirect(builder_, &graphs); + builder_.Finish(context); + QnnExecuTorchContextBinary qcir_binary( + {builder_.GetBufferPointer(), builder_.GetSize()}); + binary_info = MakeBinaryInfo(qcir_binary); + } else { + if (qnn_manager_->Compile(graph_name, op_wrappers) != + executorch::runtime::Error::Ok) { + QNN_EXECUTORCH_LOG_ERROR("Fail to compile QNN graph"); + return py::array_t(0); + } + if (qnn_manager_->GetContextBinary(binary_info) != + executorch::runtime::Error::Ok) { + return py::array_t(0); + } } - // allocate py::array (to pass the result of the C++ function to - // Python) - auto result = py::array_t(context_binary.nbytes); + // allocate py::array (to pass the result of the C++ function to Python) + auto result = py::array_t(binary_info.nbytes); auto result_buffer = result.request(); char* result_ptr = (char*)result_buffer.ptr; - std::memcpy(result_ptr, context_binary.buffer, context_binary.nbytes); + std::memcpy(result_ptr, binary_info.buffer, binary_info.nbytes); return result; } + void Destroy() { return qnn_manager_->Destroy(); } @@ -156,38 +262,76 @@ class PyQnnManager { return qnn_manager_->IsTensorDump(); } - executorch::runtime::Error AllocateTensor() { - return qnn_manager_->AllocateTensor(); + executorch::runtime::Error AllocateTensor(const std::string& graph_name) { + return qnn_manager_->AllocateTensor(graph_name); } - py::list GetGraphInputs() { + py::list GetGraphInputs(const std::string& graph_name) { py::list ret; for (const std::shared_ptr& input : - qnn_manager_->GetGraphInputs()) { + qnn_manager_->GetGraphInputs(graph_name)) { ret.append(PyQnnTensorWrapper(input)); } return ret; } - py::list GetGraphOutputs() { + py::list GetGraphOutputs(const std::string& graph_name) { py::list ret; for (const std::shared_ptr& output : - qnn_manager_->GetGraphOutputs()) { + qnn_manager_->GetGraphOutputs(graph_name)) { ret.append(PyQnnTensorWrapper(output)); } return ret; } + py::list GetGraphNames() { + py::list ret; + for (const std::string& graph_name : qnn_manager_->GetGraphNames()) { + ret.append(graph_name); + } + return ret; + } + uint64_t GetSpillFillBufferSize() { return qnn_manager_->GetSpillFillBufferSize(); } + py::array_t MakeBinaryInfo(const py::bytes& ctx_bin) { + py::buffer_info info(py::buffer(ctx_bin).request()); + QnnExecuTorchContextBinary binary( + {info.ptr, static_cast(info.size * info.itemsize)}); + auto binary_info = MakeBinaryInfo(binary); + auto result = py::array_t(binary_info.nbytes); + auto result_buffer = result.request(); + std::memcpy(result_buffer.ptr, binary_info.buffer, binary_info.nbytes); + return result; + } + private: + QnnExecuTorchContextBinary MakeBinaryInfo( + const QnnExecuTorchContextBinary& ctx_bin) { + auto signature = []() { + return std::to_string( + std::chrono::high_resolution_clock::now().time_since_epoch().count()); + }; + const uint8_t* base = static_cast(ctx_bin.buffer); + std::vector data(base, base + ctx_bin.nbytes); + // add signature to binary for cache reuse in runtime + builder_.Reset(); + auto binary_info = qnn_delegate::CreateBinaryInfoDirect( + builder_, signature().c_str(), &data); + builder_.Finish(binary_info); + + return QnnExecuTorchContextBinary( + {builder_.GetBufferPointer(), builder_.GetSize()}); + } + // Store the bytes object instead of a raw pointer so that this module will // keep the bytes alive. 
const py::bytes qnn_executorch_option_ptr_; QnnExecuTorchContextBinary qnn_executorch_context_binary_; std::shared_ptr qnn_manager_; + flatbuffers::FlatBufferBuilder builder_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/partition/qnn_partitioner.py b/backends/qualcomm/partition/qnn_partitioner.py index 659bda517f0..05054cc5d8c 100644 --- a/backends/qualcomm/partition/qnn_partitioner.py +++ b/backends/qualcomm/partition/qnn_partitioner.py @@ -10,9 +10,9 @@ import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager import torch from executorch.backends.qualcomm.builders import node_visitor +from executorch.backends.qualcomm.builders.qnn_constants import OpContextLoader from executorch.backends.qualcomm.qnn_preprocess import QnnBackend from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER -from executorch.backends.qualcomm.utils.utils import generate_qnn_executorch_option from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( @@ -32,6 +32,7 @@ not_supported_operator, to_be_implemented_operator, ) +from .utils import generate_qnn_executorch_option class QnnOperatorSupport(OperatorSupportBase): @@ -63,7 +64,11 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: ) return False - if node.target in allow_list_operator: + if ( + node.target in allow_list_operator + # bypass if custom op appears + or OpContextLoader.namespace == node.target.namespace + ): return True if ( diff --git a/backends/qualcomm/partition/utils.py b/backends/qualcomm/partition/utils.py new file mode 100644 index 00000000000..88b922d4e1f --- /dev/null +++ b/backends/qualcomm/partition/utils.py @@ -0,0 +1,22 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import List + +from executorch.backends.qualcomm.utils.constants import QCOM_QNN_COMPILE_SPEC + +from executorch.exir.backend.compile_spec_schema import CompileSpec + + +def generate_qnn_executorch_option( + compiler_specs: List[CompileSpec], +) -> bytes: + for compiler_spec in compiler_specs: + if compiler_spec.key == QCOM_QNN_COMPILE_SPEC: + qnn_compile_spec_buffer = compiler_spec.value + else: + raise ValueError(f"unknown compiler spec key value: {compiler_spec.key}") + return qnn_compile_spec_buffer diff --git a/backends/qualcomm/qnn_preprocess.py b/backends/qualcomm/qnn_preprocess.py index f13d3fb55ae..0575137cbc3 100644 --- a/backends/qualcomm/qnn_preprocess.py +++ b/backends/qualcomm/qnn_preprocess.py @@ -19,7 +19,7 @@ from executorch.backends.qualcomm._passes.layout_transform import LayoutTransform from executorch.backends.qualcomm.builders.node_visitor import get_node_visitors from executorch.backends.qualcomm.builders.qnn_constants import OpContextLoader -from executorch.backends.qualcomm.utils.utils import generate_qnn_executorch_option +from executorch.backends.qualcomm.partition.utils import generate_qnn_executorch_option from executorch.exir.backend.backend_details import ( BackendDetails, CompileSpec, @@ -83,7 +83,7 @@ def preprocess( ) try: context_loader_target = eval( - f"torch.ops.{OpContextLoader.namespace}.{node.name}.default", + f"torch.ops.{OpContextLoader.namespace}.{node.target.__name__}", globals().update(torch.__dict__), ) assert node.target == context_loader_target, err_msg @@ -104,11 +104,13 @@ def preprocess( else: raise RuntimeError(f"{node.op} is not supported in Qnn") qnn_context_binary = qnn_manager.Compile( - [py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrapper_list] + qnn_manager.GetGraphNames()[0], + [py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrapper_list], ) assert len(qnn_context_binary) != 0, "Failed to generate Qnn context binary." qnn_manager.Destroy() # For now, debug_handle_map is not used by QNN ExecuTorch return PreprocessResult( - processed_bytes=bytes(qnn_context_binary), debug_handle_map={} + processed_bytes=bytes(qnn_context_binary), + debug_handle_map={}, ) diff --git a/backends/qualcomm/runtime/Logging.h b/backends/qualcomm/runtime/Logging.h index 66705de2ac3..8c0843afab9 100644 --- a/backends/qualcomm/runtime/Logging.h +++ b/backends/qualcomm/runtime/Logging.h @@ -7,7 +7,7 @@ */ #pragma once -#include +#include #include namespace executorch { namespace backends { diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 5a55df6da3f..ace6d5ee50c 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -6,14 +6,15 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include #include +#include #include #include -#include + namespace executorch { namespace backends { namespace qnn { + using namespace qnn_delegate; using executorch::runtime::ArrayRef; using executorch::runtime::BackendExecutionContext; @@ -24,6 +25,7 @@ using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; using executorch::runtime::MemoryAllocator; using executorch::runtime::Result; + // ========== Public method implementations ========================= constexpr const char* QNN_COMPILE_SPEC = "qnn_compile_spec"; Result QnnExecuTorchBackend::init( @@ -45,6 +47,7 @@ Result QnnExecuTorchBackend::init( else QNN_EXECUTORCH_LOG_WARN("unknown argument: %s", compile_spec.key); } + // Create QnnManager MemoryAllocator* runtime_allocator = context.get_runtime_allocator(); QnnManager* qnn_manager = @@ -54,130 +57,39 @@ Result QnnExecuTorchBackend::init( // destructible, we must call the destructor manually in destroy(). new (qnn_manager) QnnManager(qnn_executorch_options, qnn_context_blob); + // TODO: this is a temporal solution for multi-graph support, will be + // removed once framework starts to accept runtime configuration + // --- + // check if current context binary has already been initialized + // return cached one for reducing memory footprint + std::string signature = qnn_manager->GetBinarySignature(); + auto iter = delegate_map_.find(signature); + if (iter != delegate_map_.end()) { + QNN_EXECUTORCH_LOG_INFO( + "Use cached delegate handle for current method: %s", + context.get_method_name()); + return iter->second; + } + ET_CHECK_OR_RETURN_ERROR( qnn_manager->Init() == Error::Ok, Internal, "Fail to initialize Qnn Manager"); if (qnn_manager->IsOnlinePrepare()) { - auto graph = qcir::GetGraph(qnn_context_blob.buffer); - // qcir tensors to TensorWrapper - std::vector> tensors, graph_inputs, - graph_outputs; - for (const auto& tensor : *graph->tensors()) { - tensors.emplace_back(CreateTensorWrapper(ToTensor(tensor))); - if (tensor->type() == qcir::TensorType::WRITE) { - graph_inputs.push_back(tensors.back()); - } else if (tensor->type() == qcir::TensorType::READ) { - graph_outputs.push_back(tensors.back()); - } - } - - std::vector> op_wrappers; - // qcir graph node to OpWrapper - for (const auto& node : *graph->nodes()) { - std::shared_ptr op = std::make_shared( - node->name()->str(), - node->package_name()->str(), - node->type_name()->str()); - - // qcir input tensors to OpWrapper input tensors - std::vector> inputs; - for (uint32_t index : *node->inputs()) { - inputs.push_back(tensors[index]); - } - op->AddInputTensors(inputs); - - // qcir output tensors to OpWrapper output tensors - std::vector> outputs; - for (uint32_t index : *node->outputs()) { - outputs.push_back(tensors[index]); - } - op->AddOutputTensors(outputs); - - // qcir operator param to OpWrapper param - for (uint32_t index : *node->params()) { - const auto& tensor = graph->tensors()->Get(index); - std::string name = tensor->name()->str(); - Qnn_DataType_t dtype = ToDataType(tensor->dtype()); - if (tensor->shape()->size() != 0) { - // add tensor param - op->AddTensorParam( - name, - dtype, - tensor->shape()->size(), - tensor->shape()->data(), - tensor->data()->data()); - } else { - // add scalar param - switch (dtype) { - case Qnn_DataType_t::QNN_DATATYPE_INT_32: - op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); - break; - case Qnn_DataType_t::QNN_DATATYPE_INT_16: - op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); - break; - 
case Qnn_DataType_t::QNN_DATATYPE_INT_8: - op->AddScalarParam( - name, dtype, static_cast(*tensor->data()->Data())); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_32: - op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_16: - op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_8: - op->AddScalarParam(name, dtype, *tensor->data()->Data()); - break; - case Qnn_DataType_t::QNN_DATATYPE_FLOAT_32: - case Qnn_DataType_t::QNN_DATATYPE_FLOAT_16: - op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); - break; - case Qnn_DataType_t::QNN_DATATYPE_BOOL_8: - op->AddScalarParam(name, dtype, *tensor->data()->Data()); - break; - default: - QNN_EXECUTORCH_LOG_ERROR( - "Invalid scalar type: %s", tensor->name()->c_str()); - break; - } - } - } - op_wrappers.push_back(std::move(op)); - } - - QnnExecuTorchContextBinary context_binary; - ET_CHECK_OR_RETURN_ERROR( - qnn_manager->Compile(op_wrappers, context_binary) == Error::Ok, - Internal, - "Fail to compile graph in online prepare stage"); - ET_CHECK_OR_RETURN_ERROR( - qnn_manager->AllocateTensor(graph_inputs, graph_outputs) == Error::Ok, + qnn_manager->CompileQcir() == Error::Ok, Internal, - "Fail to allocate tensor in online prepare stage"); + "Fail to compile binary in qcir format"); } else { - ET_CHECK_OR_RETURN_ERROR( - qnn_manager->AllocateTensor() == Error::Ok, - Internal, - "Fail to allocate tensor"); + for (const std::string& graph_name : qnn_manager->GetGraphNames()) { + ET_CHECK_OR_RETURN_ERROR( + qnn_manager->AllocateTensor(graph_name) == Error::Ok, + Internal, + "Fail to allocate tensor"); + } } + add_cached_delegate(signature, qnn_manager); return qnn_manager; } @@ -185,12 +97,17 @@ Error QnnExecuTorchBackend::execute( BackendExecutionContext& context, DelegateHandle* handle, EValue** args) const { + ET_CHECK_OR_RETURN_ERROR( + delegate_map_rev_.count(handle) != 0, + Internal, + "DelegateHandle has been deleted"); QnnManager* qnn_manager = static_cast(handle); + std::string method_name = context.get_method_name(); std::vector> input_tensors = - qnn_manager->GetGraphInputs(); + qnn_manager->GetGraphInputs(method_name); std::vector> output_tensors = - qnn_manager->GetGraphOutputs(); + qnn_manager->GetGraphOutputs(method_name); std::vector input_tensor_structs; std::vector output_tensor_structs; @@ -223,13 +140,15 @@ Error QnnExecuTorchBackend::execute( ET_CHECK_OR_RETURN_ERROR( qnn_manager->Execute( + method_name, input_tensor_structs, output_tensor_structs, context.event_tracer()) == Error::Ok, Internal, "Fail to execute graph"); ET_CHECK_OR_RETURN_ERROR( - qnn_manager->ProfileExecuteData(context.event_tracer()) == Error::Ok, + qnn_manager->ProfileExecuteData(method_name, context.event_tracer()) == + Error::Ok, Internal, "Fail to profile graph"); @@ -237,9 +156,10 @@ Error QnnExecuTorchBackend::execute( } void QnnExecuTorchBackend::destroy(DelegateHandle* handle) const { - if (handle != nullptr) { + if (handle != nullptr && delegate_map_rev_.count(handle)) { QnnManager* qnn_manager = static_cast(handle); qnn_manager->Destroy(); + erase_cached_delegate(handle); } } @@ -247,6 +167,25 @@ bool QnnExecuTorchBackend::is_available() const { return true; } +void QnnExecuTorchBackend::add_cached_delegate( + const std::string& signature, + executorch::runtime::DelegateHandle* handle) const { + std::lock_guard guard(mutex_); + delegate_map_[signature] = handle; + 
delegate_map_rev_[handle] = signature; +} + +void QnnExecuTorchBackend::erase_cached_delegate( + executorch::runtime::DelegateHandle* handle) const { + std::lock_guard guard(mutex_); + auto iter = delegate_map_rev_.find(handle); + if (iter == delegate_map_rev_.end()) { + return; + } + delegate_map_.erase(iter->second); + delegate_map_rev_.erase(handle); +} + namespace { auto cls = QnnExecuTorchBackend(); executorch::runtime::Backend backend{"QnnBackend", &cls}; diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.h b/backends/qualcomm/runtime/QnnExecuTorchBackend.h index 70677b0009b..630067da48a 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.h +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.h @@ -11,6 +11,9 @@ #include #include +#include +#include + namespace executorch { namespace backends { namespace qnn { @@ -34,6 +37,18 @@ class QnnExecuTorchBackend final void destroy(executorch::runtime::DelegateHandle* handle) const override; bool is_available() const override; + + private: + void add_cached_delegate( + const std::string& signature, + executorch::runtime::DelegateHandle* handle) const; + void erase_cached_delegate(executorch::runtime::DelegateHandle* handle) const; + + mutable std::mutex mutex_; + mutable std::unordered_map + delegate_map_; + mutable std::unordered_map + delegate_map_rev_; }; } // namespace qnn diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 9eeb6a8a016..a4d83585f28 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -5,6 +5,9 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ + +#include +#include #include #include #include @@ -301,10 +304,14 @@ Error QnnManager::Init() { backend_params_ptr_->qnn_context_ptr_->Configure() == Error::Ok, Internal, "Fail to configure Qnn context"); - ET_CHECK_OR_RETURN_ERROR( - backend_params_ptr_->qnn_graph_ptr_->Configure() == Error::Ok, - Internal, - "Fail to configure Qnn graph"); + for (const std::string& graph_name : + backend_params_ptr_->qnn_context_ptr_->GetGraphNames()) { + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_->qnn_graph_ptr_->Configure(graph_name) == + Error::Ok, + Internal, + "Fail to configure Qnn graph"); + } backend_params_ptr_->backend_init_state_ = BackendInitializeState::INITIALIZED; } @@ -318,20 +325,22 @@ Error QnnManager::Init() { return Error::Ok; } -Error QnnManager::AllocateTensor() { +Error QnnManager::AllocateTensor(const std::string& graph_name) { std::vector input_tensors = - backend_params_ptr_->qnn_context_ptr_->GetGraphInputs(); + backend_params_ptr_->qnn_context_ptr_->GetGraphInputs(graph_name); std::vector output_tensors = - backend_params_ptr_->qnn_context_ptr_->GetGraphOutputs(); + backend_params_ptr_->qnn_context_ptr_->GetGraphOutputs(graph_name); for (auto& tensor : input_tensors) { std::shared_ptr tensor_wrapper = CreateTensorWrapper(tensor); tensor_wrapper->UpdateQnnTensorMeta(tensor); - input_tensors_.emplace_back(std::move(tensor_wrapper)); + input_tensors_[graph_name].emplace_back(std::move(tensor_wrapper)); } if (!options_->is_from_context_binary()) { std::sort( - input_tensors_.begin(), input_tensors_.end(), CompareExportedInput); + input_tensors_[graph_name].begin(), + input_tensors_[graph_name].end(), + CompareExportedInput); } for (size_t i = 0; i < output_tensors.size(); ++i) { std::shared_ptr tensor_wrapper = @@ -347,36 +356,37 @@ Error 
QnnManager::AllocateTensor() { if (IsTensorDump()) { tensor_wrapper->AllocateDataBuffer(); } - output_tensors_.emplace_back(std::move(tensor_wrapper)); + output_tensors_[graph_name].emplace_back(std::move(tensor_wrapper)); } return Error::Ok; } Error QnnManager::AllocateTensor( + const std::string& graph_name, std::vector>& inputs, std::vector>& outputs) { - input_tensors_ = std::move(inputs); - for (auto& output_tensor : outputs) { - if (IsTensorDump()) { - output_tensor->AllocateDataBuffer(); - } - } + input_tensors_[graph_name] = std::move(inputs); + // TODO: suuport per-tensor dump in online prepare mode + // should be achievable with some pre-process if (!options_->is_from_context_binary()) { std::sort( - input_tensors_.begin(), input_tensors_.end(), CompareExportedInput); + input_tensors_[graph_name].begin(), + input_tensors_[graph_name].end(), + CompareExportedInput); } - output_tensors_ = std::move(outputs); + output_tensors_[graph_name] = std::move(outputs); return Error::Ok; } Error QnnManager::Execute( + const std::string& graph_name, const std::vector& input_tensor_structs, std::vector& output_tensor_structs, executorch::runtime::EventTracer* event_tracer) { Qnn_ErrorHandle_t error = QNN_SUCCESS; error = backend_params_ptr_->qnn_graph_ptr_->GraphExecute( - input_tensor_structs, output_tensor_structs); + graph_name, input_tensor_structs, output_tensor_structs); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( @@ -413,11 +423,12 @@ Error QnnManager::Execute( } Error QnnManager::ProfileExecuteData( + const std::string& graph_name, executorch::runtime::EventTracer* event_tracer) { Qnn_ErrorHandle_t error = QNN_SUCCESS; if (options_->profile_level() != QnnExecuTorchProfileLevel::kProfileOff) { - error = - backend_params_ptr_->qnn_graph_ptr_->ProfileExecuteData(event_tracer); + error = backend_params_ptr_->qnn_graph_ptr_->ProfileExecuteData( + graph_name, event_tracer); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( " Failed to profile. 
Error %d", QNN_GET_ERROR_CODE(error)); @@ -465,16 +476,163 @@ bool QnnManager::IsNodeSupportedByBackend( return true; } -Error QnnManager::Compile( - std::vector>& op_wrappers, +Error QnnManager::GetContextBinary( QnnExecuTorchContextBinary& qnn_executorch_context_binary) { + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_->qnn_context_ptr_->GetContextBinary( + qnn_executorch_context_binary) == Error::Ok, + Internal, + "Fail to get context binary."); + + return Error::Ok; +} + +Error QnnManager::CompileQcir() { + flatbuffers::Verifier verifier_binary_info( + static_cast(qnn_context_blob_.buffer), + qnn_context_blob_.nbytes); + if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { + QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); + return Error::Internal; + } + + auto binary_info = qnn_delegate::GetBinaryInfo(qnn_context_blob_.buffer); + flatbuffers::Verifier verifier_qcir( + binary_info->data()->data(), binary_info->data()->size()); + if (!qcir::VerifyContextBuffer(verifier_qcir)) { + QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format"); + return Error::Internal; + } + + auto context = qcir::GetContext(binary_info->data()->data()); + for (const auto& graph : *context->graphs()) { + // qcir tensors to TensorWrapper + std::vector> graph_inputs, graph_outputs, + tensors; + for (const auto& tensor : *graph->tensors()) { + tensors.emplace_back(CreateTensorWrapper(ToTensor(tensor))); + if (tensor->type() == qcir::TensorType::WRITE) { + graph_inputs.push_back(tensors.back()); + } else if (tensor->type() == qcir::TensorType::READ) { + graph_outputs.push_back(tensors.back()); + } + } + std::vector> op_wrappers; + // qcir graph node to OpWrapper + for (const auto& node : *graph->nodes()) { + std::shared_ptr op = std::make_shared( + node->name()->str(), + node->package_name()->str(), + node->type_name()->str()); + + // qcir input tensors to OpWrapper input tensors + std::vector> inputs; + for (uint32_t index : *node->inputs()) { + inputs.push_back(tensors[index]); + } + op->AddInputTensors(inputs); + + // qcir output tensors to OpWrapper output tensors + std::vector> outputs; + for (uint32_t index : *node->outputs()) { + outputs.push_back(tensors[index]); + } + op->AddOutputTensors(outputs); + + // qcir operator param to OpWrapper param + for (uint32_t index : *node->params()) { + const auto& tensor = graph->tensors()->Get(index); + std::string name = tensor->name()->str(); + Qnn_DataType_t dtype = ToDataType(tensor->dtype()); + if (tensor->shape()->size() != 0) { + // add tensor param + op->AddTensorParam( + name, + dtype, + tensor->shape()->size(), + tensor->shape()->data(), + tensor->data()->data()); + } else { + // add scalar param + switch (dtype) { + case Qnn_DataType_t::QNN_DATATYPE_INT_32: + op->AddScalarParam( + name, + dtype, + *reinterpret_cast(tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_INT_16: + op->AddScalarParam( + name, + dtype, + *reinterpret_cast(tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_INT_8: + op->AddScalarParam( + name, dtype, static_cast(*tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_UINT_32: + op->AddScalarParam( + name, + dtype, + *reinterpret_cast(tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_UINT_16: + op->AddScalarParam( + name, + dtype, + *reinterpret_cast(tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_UINT_8: + op->AddScalarParam(name, dtype, *tensor->data()->Data()); + break; + case 
Qnn_DataType_t::QNN_DATATYPE_FLOAT_32: + case Qnn_DataType_t::QNN_DATATYPE_FLOAT_16: + op->AddScalarParam( + name, + dtype, + *reinterpret_cast(tensor->data()->Data())); + break; + case Qnn_DataType_t::QNN_DATATYPE_BOOL_8: + op->AddScalarParam(name, dtype, *tensor->data()->Data()); + break; + default: + QNN_EXECUTORCH_LOG_ERROR( + "Invalid scalar type: %s", tensor->name()->c_str()); + break; + } + } + } + op_wrappers.push_back(std::move(op)); + } + + ET_CHECK_OR_RETURN_ERROR( + Compile(graph->name()->str(), op_wrappers) == Error::Ok, + Internal, + "Fail to compile graph from qcir with graph_name: %s", + graph->name()->str().c_str()); + + ET_CHECK_OR_RETURN_ERROR( + AllocateTensor(graph->name()->str(), graph_inputs, graph_outputs) == + Error::Ok, + Internal, + "Fail to allocate tensor for qcir with graph_name: %s", + graph->name()->str().c_str()); + } + + return Error::Ok; +} + +Error QnnManager::Compile( + const std::string& graph_name, + std::vector>& op_wrappers) { Qnn_ErrorHandle_t error = QNN_SUCCESS; for (std::shared_ptr& op_wrapper : op_wrappers) { for (const auto& tensor_wrapper : op_wrapper->GetInputTensors()) { ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_->qnn_graph_ptr_->EnsureTensorInQnnGraph( - tensor_wrapper) == Error::Ok, + graph_name, tensor_wrapper) == Error::Ok, Internal, "Tensor name %s isn't added to Qnn Graph", tensor_wrapper->GetName().c_str()); @@ -483,7 +641,7 @@ Error QnnManager::Compile( for (const auto& tensor_wrapper : op_wrapper->GetOutputTensors()) { ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_->qnn_graph_ptr_->EnsureTensorInQnnGraph( - tensor_wrapper) == Error::Ok, + graph_name, tensor_wrapper) == Error::Ok, Internal, "Tensor name %s isn't added to Qnn Graph", tensor_wrapper->GetName().c_str()); @@ -494,7 +652,7 @@ Error QnnManager::Compile( if (p_tensor_param != nullptr) { ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_->qnn_graph_ptr_->EnsureTensorInQnnGraph( - p_tensor_param->GetTensorWrapper()) == Error::Ok, + graph_name, p_tensor_param->GetTensorWrapper()) == Error::Ok, Internal, "Param tensor name %s isn't added to Qnn Graph", p_tensor_param->GetName().c_str()); @@ -506,7 +664,7 @@ Error QnnManager::Compile( } error = backend_params_ptr_->qnn_graph_ptr_->GraphAddNode( - op_wrapper->GetOpConfig()); + graph_name, op_wrapper->GetOpConfig()); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Failed to add node to Qnn Graph with error: %d", @@ -515,7 +673,7 @@ Error QnnManager::Compile( } } - error = backend_params_ptr_->qnn_graph_ptr_->GraphFinalize(); + error = backend_params_ptr_->qnn_graph_ptr_->GraphFinalize(graph_name); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Failed to finalize Qnn Graph with error: %d", @@ -523,17 +681,18 @@ Error QnnManager::Compile( return Error::Internal; } - // no need to generate extra context binary in online prepare scenario - if (!IsOnlinePrepare()) { - ET_CHECK_OR_RETURN_ERROR( - backend_params_ptr_->qnn_context_ptr_->GetContextBinary( - qnn_executorch_context_binary) == Error::Ok, - Internal, - "Fail to get context binary."); - } - return Error::Ok; -}; +} + +std::string QnnManager::GetBinarySignature() { + flatbuffers::Verifier verifier( + static_cast(qnn_context_blob_.buffer), + qnn_context_blob_.nbytes); + return VerifyBinaryInfoBuffer(verifier) + ? 
GetBinaryInfo(qnn_context_blob_.buffer)->signature()->str() + : ""; +} + } // namespace qnn } // namespace backends } // namespace executorch diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 2b0fc09a591..0157ee58378 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -9,10 +9,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -30,17 +30,20 @@ class QnnManager { ~QnnManager(); executorch::runtime::Error Init(); - executorch::runtime::Error AllocateTensor(); + executorch::runtime::Error AllocateTensor(const std::string& graph_name); executorch::runtime::Error AllocateTensor( + const std::string& graph_name, std::vector>& inputs, std::vector>& outputs); executorch::runtime::Error Execute( + const std::string& graph_name, const std::vector& input_tensor_structs, std::vector& output_tensor_structs, executorch::runtime::EventTracer* event_tracer); executorch::runtime::Error ProfileExecuteData( + const std::string& graph_name, executorch::runtime::EventTracer* event_tracer); void Destroy(); @@ -53,6 +56,10 @@ class QnnManager { return options_->online_prepare(); } + bool IsMultipleGraphs() { + return options_->multiple_graphs(); + } + bool IsTensorDump() { return options_->dump_intermediate_outputs(); } @@ -60,10 +67,15 @@ class QnnManager { bool IsNodeSupportedByBackend( std::vector>& op_wrappers); - executorch::runtime::Error Compile( - std::vector>& op_wrappers, + executorch::runtime::Error GetContextBinary( QnnExecuTorchContextBinary& qnn_executorch_context_binary); + executorch::runtime::Error CompileQcir(); + + executorch::runtime::Error Compile( + const std::string& graph_name, + std::vector>& op_wrappers); + executorch::runtime::Error RegisterMem( void* data_ptr, const std::shared_ptr& tensor_wrapper); @@ -77,13 +89,26 @@ class QnnManager { return htp_backend_cache_ptr->GetSpillFillBufferSize(); } - std::vector> GetGraphInputs() { - return input_tensors_; + std::vector> GetGraphInputs( + const std::string& graph_name) { + return !input_tensors_.count(graph_name) + ? std::vector>() + : input_tensors_[graph_name]; } - std::vector> GetGraphOutputs() { - return output_tensors_; + + std::vector> GetGraphOutputs( + const std::string& graph_name) { + return !output_tensors_.count(graph_name) + ? 
std::vector>() + : output_tensors_[graph_name]; } + std::vector GetGraphNames() { + return backend_params_ptr_->qnn_context_ptr_->GetGraphNames(); + } + + std::string GetBinarySignature(); + private: executorch::runtime::Error LoadQnnLibrary(); @@ -96,8 +121,10 @@ class QnnManager { QnnImplementation qnn_loaded_backend_; std::unique_ptr logger_; const QnnExecuTorchOptions* options_; - std::vector> input_tensors_; - std::vector> output_tensors_; + std::unordered_map>> + input_tensors_; + std::unordered_map>> + output_tensors_; executorch::runtime::Error RegisterIonMem( void* data_ptr, const std::shared_ptr& tensor_wrapper); diff --git a/backends/qualcomm/runtime/backends/CMakeLists.txt b/backends/qualcomm/runtime/backends/CMakeLists.txt index 9147d4f32a9..2df806db52c 100644 --- a/backends/qualcomm/runtime/backends/CMakeLists.txt +++ b/backends/qualcomm/runtime/backends/CMakeLists.txt @@ -91,6 +91,7 @@ target_sources( ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpGraph.cpp ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpGraphCustomConfig.h ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpGraphCustomConfig.cpp + ${HOST_ARCHITECTURE}/HtpGraphCustomConfig.cpp ) # qnn_backend diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 350c040b221..43cb835cfff 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -5,29 +5,30 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ + #include +#include #include + namespace executorch { namespace backends { namespace qnn { using executorch::runtime::Error; -Error QnnBackendCache::GetQnnGraphInfoFromBinary() { +Error QnnBackendCache::GetQnnGraphInfoFromBinary( + void* buffer, + uint32_t nbytes) { const QnnSystemInterface& qnn_sys_interface = qnn_sys_impl_.GetQnnSystemInterface(); std::uint32_t num_graphs; - QnnSystemContext_GraphInfo_t* graph = nullptr; + QnnSystemContext_GraphInfo_t* graphs = nullptr; const QnnSystemContext_BinaryInfo_t* binaryinfo{nullptr}; Qnn_ContextBinarySize_t binaryinfo_size = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; error = qnn_sys_interface.qnn_system_context_get_binary_info( - sys_context_handle_, - qnn_context_blob_.buffer, - qnn_context_blob_.nbytes, - &binaryinfo, - &binaryinfo_size); + sys_context_handle_, buffer, nbytes, &binaryinfo, &binaryinfo_size); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_WARN( @@ -47,45 +48,26 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary() { if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { num_graphs = binaryinfo->contextBinaryInfoV1.numGraphs; - graph = binaryinfo->contextBinaryInfoV1.graphs; + graphs = binaryinfo->contextBinaryInfoV1.graphs; } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { num_graphs = binaryinfo->contextBinaryInfoV2.numGraphs; - graph = binaryinfo->contextBinaryInfoV2.graphs; + graphs = binaryinfo->contextBinaryInfoV2.graphs; } else { QNN_EXECUTORCH_LOG_WARN( "Unknown QNN BinaryInfo version %d.", binaryinfo->version); return Error::Internal; } - if (num_graphs > 1) { - QNN_EXECUTORCH_LOG_WARN( - "The context binary contains %lu graphs. 
But now " - "assume that one context binary contains one graph.", - num_graphs); - return Error::Internal; - } - - // only have version_1 now - if (graph[0].version != QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) { - QNN_EXECUTORCH_LOG_WARN( - "Unknown QNN GraphInfo version %d.", graph[0].version); - return Error::Internal; - } - // get graph name from metadata - graph_name_ = graph->graphInfoV1.graphName; - - // get graph inputs from metadata - uint32_t numGraphInputs = graph->graphInfoV1.numGraphInputs; - input_tensor_structs_.reserve(numGraphInputs); - for (std::uint32_t i = 0; i < numGraphInputs; ++i) { - input_tensor_structs_.emplace_back(graph->graphInfoV1.graphInputs[i]); - } - - // get graph outputs from metadata - uint32_t numGraphOutputs = graph->graphInfoV1.numGraphOutputs; - output_tensor_structs_.reserve(numGraphOutputs); - for (std::uint32_t i = 0; i < numGraphOutputs; ++i) { - output_tensor_structs_.emplace_back(graph->graphInfoV1.graphOutputs[i]); + for (std::uint32_t i = 0; i < num_graphs; ++i) { + if (graphs->version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) { + RetrieveGraphInfo(graphs[i].graphInfoV1); + } else if (graphs->version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_2) { + RetrieveGraphInfo(graphs[i].graphInfoV2); + } else { + QNN_EXECUTORCH_LOG_WARN( + "Unknown QNN GraphInfo version %d.", binaryinfo->version); + return Error::Internal; + } } return Error::Ok; @@ -94,6 +76,8 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary() { Error QnnBackendCache::Configure() { if (qnn_context_blob_.buffer == nullptr) { state_ = SERIALIZE; + // use aot_graph_name if we're lowering graph on host side + graph_names_.push_back(aot_graph_name_); QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in SAVE MODE."); return Error::Ok; } @@ -123,16 +107,30 @@ Error QnnBackendCache::Configure() { // DO DESERIALIZE state_ = DESERIALIZE; QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in RESTORE MODE."); - Error status = GetQnnGraphInfoFromBinary(); + flatbuffers::Verifier verifier_binary_info( + static_cast(qnn_context_blob_.buffer), + qnn_context_blob_.nbytes); + if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { + QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); + return Error::Internal; + } + + auto binary_info = GetBinaryInfo(qnn_context_blob_.buffer); + Error status = GetQnnGraphInfoFromBinary( + const_cast(binary_info->data()->data()), + binary_info->data()->size()); + if (status == Error::Internal) { // check if context binary came from flatbuffer - flatbuffers::FlatBufferBuilder builder; flatbuffers::Verifier verifier( - static_cast(qnn_context_blob_.buffer), - qnn_context_blob_.nbytes); + binary_info->data()->data(), binary_info->data()->size()); - if (qcir::VerifyGraphBuffer(verifier)) { + if (qcir::VerifyContextBuffer(verifier)) { state_ = ONLINE_PREPARE; + auto context = qcir::GetContext(binary_info->data()->data()); + for (const auto& graph : *context->graphs()) { + graph_names_.emplace_back(graph->name()->str()); + } return Error::Ok; } @@ -159,19 +157,42 @@ QnnBackendCache::~QnnBackendCache() { qnn_sys_impl_.Unload(); } -std::vector QnnBackendCache::GetGraphInputs() { +std::vector QnnBackendCache::GetGraphInputs( + const std::string& graph_name) { if (state_ != DESERIALIZE) return {}; - return input_tensor_structs_; + return input_tensor_structs_[graph_name]; } -std::vector QnnBackendCache::GetGraphOutputs() { +std::vector QnnBackendCache::GetGraphOutputs( + const std::string& graph_name) { if (state_ != DESERIALIZE) return {}; - return output_tensor_structs_; + 
return output_tensor_structs_[graph_name]; +} + +template +void QnnBackendCache::RetrieveGraphInfo(const INFO& info) { + // get graph name from metadata + graph_names_.push_back(info.graphName); + // get graph inputs from metadata + uint32_t numGraphInputs = info.numGraphInputs; + input_tensor_structs_[graph_names_.back()].reserve(numGraphInputs); + for (std::uint32_t i = 0; i < numGraphInputs; ++i) { + input_tensor_structs_[graph_names_.back()].emplace_back( + info.graphInputs[i]); + } + // get graph outputs from metadata + uint32_t numGraphOutputs = info.numGraphOutputs; + output_tensor_structs_[graph_names_.back()].reserve(numGraphOutputs); + for (std::uint32_t i = 0; i < numGraphOutputs; ++i) { + output_tensor_structs_[graph_names_.back()].emplace_back( + info.graphOutputs[i]); + } } + } // namespace qnn } // namespace backends } // namespace executorch diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.h b/backends/qualcomm/runtime/backends/QnnBackendCache.h index 26af927fbd8..b9e00f0a662 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.h +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.h @@ -11,7 +11,9 @@ #include #include +#include #include + namespace executorch { namespace backends { namespace qnn { @@ -23,17 +25,19 @@ class QnnBackendCache { DESERIALIZE = 2, ONLINE_PREPARE = 3, }; - explicit QnnBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob) - : qnn_context_blob_(qnn_context_blob) {} + explicit QnnBackendCache( + const QnnExecuTorchContextBinary& qnn_context_blob, + const std::string& aot_graph_name) + : qnn_context_blob_(qnn_context_blob), aot_graph_name_(aot_graph_name) {} virtual ~QnnBackendCache(); QnnBackendCache(const QnnBackendCache&) = delete; QnnBackendCache(QnnBackendCache&&) = delete; QnnBackendCache& operator=(const QnnBackendCache&) = delete; QnnBackendCache& operator=(QnnBackendCache&&) = delete; - std::vector GetGraphInputs(); + std::vector GetGraphInputs(const std::string& graph_name); - std::vector GetGraphOutputs(); + std::vector GetGraphOutputs(const std::string& graph_name); const QnnExecuTorchContextBinary& GetQnnContextBlob() { return qnn_context_blob_; @@ -47,8 +51,8 @@ class QnnBackendCache { state_ = INVALID; } - std::string GetGraphName() { - return graph_name_; + std::vector GetGraphNames() { + return graph_names_; } executorch::runtime::Error Configure(); @@ -60,16 +64,24 @@ class QnnBackendCache { } private: - executorch::runtime::Error GetQnnGraphInfoFromBinary(); + executorch::runtime::Error GetQnnGraphInfoFromBinary( + void* buffer, + uint32_t nbytes); + + template + void RetrieveGraphInfo(const INFO& info); CacheState state_{INVALID}; QnnExecuTorchContextBinary qnn_context_blob_; QnnSystemContext_Handle_t sys_context_handle_{nullptr}; QnnSystemImplementation qnn_sys_impl_{"libQnnSystem.so"}; - std::string graph_name_; - std::vector input_tensor_structs_; - std::vector output_tensor_structs_; + std::vector graph_names_; + std::string aot_graph_name_; + std::unordered_map> + input_tensor_structs_; + std::unordered_map> + output_tensor_structs_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index 7b1cb2c2399..29e6686740b 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -60,7 +60,8 @@ std::unique_ptr QnnBackendFactory::Create( implementation, logger, options->soc_info(), htp_options); 
backend_params->qnn_backend_cache_ptr_ = - std::make_unique(qnn_context_blob); + std::make_unique( + qnn_context_blob, options->graph_name()->str()); backend_params->qnn_context_ptr_ = std::make_unique( implementation, @@ -74,7 +75,6 @@ std::unique_ptr QnnBackendFactory::Create( backend_params->qnn_backend_ptr_.get(), backend_params->qnn_context_ptr_.get(), options->profile_level(), - options->graph_name()->str(), options->soc_info(), htp_options); backend_params->qnn_mem_manager_ptr_ = std::make_unique( diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.h b/backends/qualcomm/runtime/backends/QnnBackendFactory.h index f4a4ebf8991..012c2cc7b5b 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.h +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.h @@ -7,6 +7,7 @@ */ #pragma once +#include #include #include #include @@ -21,7 +22,6 @@ #include #include #include -#include #include namespace executorch { diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp index 8064dd69abe..7db5164a1d5 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp @@ -7,6 +7,7 @@ */ #include + namespace executorch { namespace backends { namespace qnn { @@ -45,12 +46,13 @@ Error QnnContext::Configure() { if (cache_->GetCacheState() == QnnBackendCache::DESERIALIZE) { const QnnExecuTorchContextBinary& qnn_context_blob = cache_->GetQnnContextBlob(); + auto binary_info = GetBinaryInfo(qnn_context_blob.buffer); error = qnn_interface.qnn_context_create_from_binary( backend_->GetHandle(), device_->GetHandle(), temp_context_config.empty() ? nullptr : temp_context_config.data(), - qnn_context_blob.buffer, - qnn_context_blob.nbytes, + const_cast(binary_info->data()->data()), + binary_info->data()->size(), &handle_, /*profile=*/nullptr); if (error != QNN_SUCCESS) { @@ -92,7 +94,7 @@ Error QnnContext::GetContextBinary( Qnn_ErrorHandle_t error = qnn_interface.qnn_context_get_binary_size(handle_, &binary_size); if (error == QNN_SUCCESS) { - binary_buffer_.reserve(binary_size); + binary_buffer_.resize(binary_size); error = qnn_interface.qnn_context_get_binary( handle_, binary_buffer_.data(), binary_size, &bytes_written); if (error != QNN_SUCCESS) { @@ -110,8 +112,18 @@ Error QnnContext::GetContextBinary( binary_size); return Error::Internal; } - qnn_executorch_context_binary.buffer = binary_buffer_.data(); - qnn_executorch_context_binary.nbytes = bytes_written; + + auto signature = []() { + return std::to_string(std::chrono::high_resolution_clock::now() + .time_since_epoch() + .count()); + }; + builder_.Reset(); + auto binary_info = qnn_delegate::CreateBinaryInfoDirect( + builder_, signature().c_str(), &binary_buffer_); + builder_.Finish(binary_info); + qnn_executorch_context_binary.buffer = builder_.GetBufferPointer(); + qnn_executorch_context_binary.nbytes = builder_.GetSize(); } } else { QNN_EXECUTORCH_LOG_ERROR( diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.h b/backends/qualcomm/runtime/backends/QnnContextCommon.h index 970b3901c6b..d93390a5379 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.h +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.h @@ -7,6 +7,7 @@ */ #pragma once +#include #include #include #include @@ -36,15 +37,17 @@ class QnnContext { return handle_; } - std::string GetGraphName() { - return cache_->GetGraphName(); + std::vector inline GetGraphNames() { + return cache_->GetGraphNames(); 
} - std::vector GetGraphInputs() { - return cache_->GetGraphInputs(); + std::vector inline GetGraphInputs( + const std::string& graph_name) { + return cache_->GetGraphInputs(graph_name); } - std::vector GetGraphOutputs() { - return cache_->GetGraphOutputs(); + std::vector inline GetGraphOutputs( + const std::string& graph_name) { + return cache_->GetGraphOutputs(graph_name); } QnnBackendCache::CacheState GetCacheState() const { return cache_->GetCacheState(); @@ -68,7 +71,8 @@ class QnnContext { QnnBackend* backend_; QnnDevice* device_; QnnBackendCache* cache_; - std::vector binary_buffer_; + std::vector binary_buffer_; + flatbuffers::FlatBufferBuilder builder_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnDeviceCommon.h b/backends/qualcomm/runtime/backends/QnnDeviceCommon.h index a6a4cc97817..85de00f8623 100644 --- a/backends/qualcomm/runtime/backends/QnnDeviceCommon.h +++ b/backends/qualcomm/runtime/backends/QnnDeviceCommon.h @@ -7,10 +7,10 @@ */ #pragma once +#include #include #include #include -#include #include diff --git a/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp b/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp index 7215472df3d..6da525b4d02 100644 --- a/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp @@ -12,7 +12,7 @@ namespace qnn { using executorch::runtime::Error; -Error QnnGraph::Configure() { +Error QnnGraph::Configure(const std::string& graph_name) { // create qnn backend const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -23,15 +23,22 @@ Error QnnGraph::Configure() { Internal, "Fail to make graph config."); + if (handle_.count(graph_name)) { + QNN_EXECUTORCH_LOG_ERROR( + "Graph '%s' has been configured.", graph_name.c_str()); + return Error::Ok; + } + + Qnn_GraphHandle_t graph_handle = nullptr; if (context_->GetCacheState() == QnnBackendCache::DESERIALIZE) { // retrieve QNN Graph error = qnn_interface.qnn_graph_retrieve( - context_->GetHandle(), context_->GetGraphName().c_str(), &handle_); + context_->GetHandle(), graph_name.c_str(), &graph_handle); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Can't retrieve graph " "%s from context. Error %d.", - context_->GetGraphName().c_str(), + graph_name.c_str(), QNN_GET_ERROR_CODE(error)); return Error::Internal; } @@ -40,9 +47,9 @@ Error QnnGraph::Configure() { context_->GetCacheState() == QnnBackendCache::ONLINE_PREPARE) { Qnn_ErrorHandle_t error = qnn_interface.qnn_graph_create( context_->GetHandle(), - graph_name_.c_str(), + graph_name.c_str(), temp_graph_config.empty() ? nullptr : temp_graph_config.data(), - &handle_); + &graph_handle); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( @@ -54,26 +61,36 @@ Error QnnGraph::Configure() { return Error::Internal; } + // book keep valid handle of created graph + handle_[graph_name] = graph_handle; // The profiler needs to be created after the backend is created. 
- profile_ = + profile_[graph_name] = std::make_unique(implementation_, backend_, profile_level_); return Error::Ok; } Qnn_ErrorHandle_t QnnGraph::GraphExecute( + const std::string& graph_name, const std::vector& input_tensor_structs, std::vector& output_tensor_structs) { + if (!handle_.count(graph_name)) { + QNN_EXECUTORCH_LOG_ERROR( + "graph name: %s does not exist.", graph_name.c_str()); + return QNN_COMMON_ERROR_GENERAL; + } + return implementation_.GetQnnInterface().qnn_graph_execute( - handle_, + handle_[graph_name], input_tensor_structs.data(), input_tensor_structs.size(), output_tensor_structs.data(), output_tensor_structs.size(), - profile_->GetHandle(), + profile_[graph_name]->GetHandle(), /*signalHandle=*/nullptr); }; Error QnnGraph::EnsureTensorInQnnGraph( + const std::string& graph_name, const std::shared_ptr& tensor_wrapper) { const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -81,7 +98,8 @@ Error QnnGraph::EnsureTensorInQnnGraph( if (!tensor_wrapper->IsTensorCreated()) { Qnn_Tensor_t tensor = tensor_wrapper->CloneTensorStruct(); - error = qnn_interface.qnn_tensor_create_graph_tensor(handle_, &tensor); + error = qnn_interface.qnn_tensor_create_graph_tensor( + handle_[graph_name], &tensor); int name_conflict_count = 0; while (error == QNN_TENSOR_ERROR_NAME_HASH_COLLISION) { @@ -99,7 +117,8 @@ Error QnnGraph::EnsureTensorInQnnGraph( // update name_conflict_count++; - error = qnn_interface.qnn_tensor_create_graph_tensor(handle_, &tensor); + error = qnn_interface.qnn_tensor_create_graph_tensor( + handle_[graph_name], &tensor); } tensor_wrapper->UpdateQnnTensorMeta(tensor); tensor_wrapper->SetTensorCreated(); diff --git a/backends/qualcomm/runtime/backends/QnnGraphCommon.h b/backends/qualcomm/runtime/backends/QnnGraphCommon.h index b8cd6c6fab8..62d9b1b9e1a 100644 --- a/backends/qualcomm/runtime/backends/QnnGraphCommon.h +++ b/backends/qualcomm/runtime/backends/QnnGraphCommon.h @@ -26,44 +26,48 @@ class QnnGraph { const QnnImplementation& implementation, QnnBackend* backend, QnnContext* context, - const QnnExecuTorchProfileLevel& profile_level, - const std::string& graph_name) - : handle_(nullptr), - implementation_(implementation), + const QnnExecuTorchProfileLevel& profile_level) + : implementation_(implementation), backend_(backend), context_(context), - profile_level_(profile_level), - graph_name_(graph_name) {} + profile_level_(profile_level) {} virtual ~QnnGraph(){}; - executorch::runtime::Error Configure(); + executorch::runtime::Error Configure(const std::string& graph_name); Qnn_ErrorHandle_t GraphExecute( + const std::string& graph_name, const std::vector& input_tensor_structs, std::vector& output_tensor_structs); - Qnn_ErrorHandle_t GraphAddNode(const Qnn_OpConfig_t& op_config) { + Qnn_ErrorHandle_t GraphAddNode( + const std::string& graph_name, + const Qnn_OpConfig_t& op_config) { return implementation_.GetQnnInterface().qnn_graph_add_node( - handle_, op_config); + handle_[graph_name], op_config); }; executorch::runtime::Error EnsureTensorInQnnGraph( + const std::string& graph_name, const std::shared_ptr& tensor_wrapper); - Qnn_ErrorHandle_t GraphFinalize() { + Qnn_ErrorHandle_t GraphFinalize(const std::string& graph_name) { return implementation_.GetQnnInterface().qnn_graph_finalize( - handle_, profile_->GetHandle(), nullptr /* signal_handle */); + handle_[graph_name], + profile_[graph_name]->GetHandle(), + nullptr /* signal_handle */); }; Qnn_ErrorHandle_t ProfileExecuteData( + const std::string& 
graph_name, executorch::runtime::EventTracer* event_tracer) { - return profile_->ProfileData(event_tracer); + return profile_[graph_name]->ProfileData(event_tracer); }; - Qnn_GraphHandle_t GetHandle() { - return handle_; + Qnn_GraphHandle_t GetHandle(const std::string& graph_name) { + return handle_[graph_name]; } - QnnProfile* GetProfile() { - return profile_.get(); + QnnProfile* GetProfile(const std::string& graph_name) { + return profile_[graph_name].get(); } protected: @@ -73,13 +77,12 @@ class QnnGraph { }; private: - Qnn_GraphHandle_t handle_; + std::unordered_map handle_; const QnnImplementation& implementation_; QnnBackend* backend_; QnnContext* context_; QnnExecuTorchProfileLevel profile_level_; - std::string graph_name_; - std::unique_ptr profile_; + std::unordered_map> profile_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnLogger.h b/backends/qualcomm/runtime/backends/QnnLogger.h index 09c74b53c60..80be4f61c59 100644 --- a/backends/qualcomm/runtime/backends/QnnLogger.h +++ b/backends/qualcomm/runtime/backends/QnnLogger.h @@ -7,8 +7,8 @@ */ #pragma once +#include #include -#include namespace executorch { namespace backends { namespace qnn { diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h index faad456aed4..4dd6897f74a 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h @@ -13,8 +13,10 @@ namespace backends { namespace qnn { class HtpBackendCache : public QnnBackendCache { public: - explicit HtpBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob) - : QnnBackendCache(qnn_context_blob), spill_fill_buf_(0) {} + explicit HtpBackendCache( + const QnnExecuTorchContextBinary& qnn_context_blob, + const std::string& aot_graph_name) + : QnnBackendCache(qnn_context_blob, aot_graph_name), spill_fill_buf_(0) {} ~HtpBackendCache() override = default; uint64_t GetSpillFillBufferSize() { diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h b/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h index a618100fcd1..f0d4873b0d2 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h @@ -8,8 +8,8 @@ #pragma once +#include #include -#include #include #include diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpDeviceCustomConfig.h b/backends/qualcomm/runtime/backends/htpbackend/HtpDeviceCustomConfig.h index 93314388886..e383c4bd460 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpDeviceCustomConfig.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpDeviceCustomConfig.h @@ -7,7 +7,7 @@ */ #pragma once -#include +#include #include #include diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpDevicePlatformInfoConfig.h b/backends/qualcomm/runtime/backends/htpbackend/HtpDevicePlatformInfoConfig.h index 9b75215ff8e..74d282c86e2 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpDevicePlatformInfoConfig.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpDevicePlatformInfoConfig.h @@ -7,7 +7,7 @@ */ #pragma once -#include +#include #include #include diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpGraph.h b/backends/qualcomm/runtime/backends/htpbackend/HtpGraph.h index 8f0f56215d5..c3add50d08b 100644 --- 
a/backends/qualcomm/runtime/backends/htpbackend/HtpGraph.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpGraph.h @@ -23,10 +23,9 @@ class HtpGraph : public QnnGraph { QnnBackend* backend, QnnContext* context, const QnnExecuTorchProfileLevel& profile_level, - const std::string& graph_name, const SocInfo* soc_info, const QnnExecuTorchHtpBackendOptions* htp_options) - : QnnGraph(implementation, backend, context, profile_level, graph_name), + : QnnGraph(implementation, backend, context, profile_level), qcom_target_soc_info_(soc_info), htp_options_(htp_options) { htp_graph_custom_config_ = diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.cpp b/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.cpp index 013403fa73d..d43f8320285 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.cpp +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.cpp @@ -12,8 +12,9 @@ namespace executorch { namespace backends { namespace qnn { std::vector -HtpGraphCustomConfig::CreateGraphCustomConfig( - const SocInfo* qcom_target_soc_info) { +HtpGraphCustomConfig::CreateGraphCustomConfigCommon( + const SocInfo* qcom_target_soc_info, + float opt_level) { std::vector ret; QnnHtpGraph_CustomConfig_t* p_custom_config = nullptr; @@ -45,8 +46,6 @@ HtpGraphCustomConfig::CreateGraphCustomConfig( break; } - float opt_level = - context_->GetCacheState() == QnnBackendCache::ONLINE_PREPARE ? 1 : 3; QNN_EXECUTORCH_LOG_INFO( "Running level=%d optimization.", static_cast(opt_level)); diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.h b/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.h index bf038bc4b9b..4a8e78ce673 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpGraphCustomConfig.h @@ -8,7 +8,7 @@ #pragma once #include -#include +#include #include #include @@ -35,6 +35,9 @@ class HtpGraphCustomConfig { htp_graph_config_.back()->option = QNN_HTP_GRAPH_CONFIG_OPTION_UNKNOWN; return htp_graph_config_.back().get(); } + std::vector CreateGraphCustomConfigCommon( + const SocInfo* qcom_target_soc_info, + float opt_level); [[maybe_unused]] const QnnExecuTorchHtpBackendOptions* htp_options_; std::vector> htp_graph_config_; diff --git a/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpGraphCustomConfig.cpp b/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpGraphCustomConfig.cpp new file mode 100644 index 00000000000..096fda7b059 --- /dev/null +++ b/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpGraphCustomConfig.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { +std::vector +HtpGraphCustomConfig::CreateGraphCustomConfig( + const SocInfo* qcom_target_soc_info) { + return CreateGraphCustomConfigCommon(qcom_target_soc_info, 1); +} +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpContextCustomConfig.cpp index 6b5266cf23b..1fc2940eaa7 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpContextCustomConfig.cpp +++ b/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpContextCustomConfig.cpp @@ -14,7 +14,18 @@ namespace qnn { std::vector HtpContextCustomConfig::CreateContextCustomConfig() { - return {}; + std::vector ret; + QnnHtpContext_CustomConfig_t* p_custom_config = nullptr; + + if (htp_options_->use_weight_sharing()) { + p_custom_config = AllocContextCustomConfig(); + p_custom_config->option = + QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED; + p_custom_config->weightSharingEnabled = true; + ret.push_back(static_cast(p_custom_config)); + } + + return ret; } } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpGraphCustomConfig.cpp b/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpGraphCustomConfig.cpp new file mode 100644 index 00000000000..330ca43e20b --- /dev/null +++ b/backends/qualcomm/runtime/backends/htpbackend/x86_64/HtpGraphCustomConfig.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { +std::vector +HtpGraphCustomConfig::CreateGraphCustomConfig( + const SocInfo* qcom_target_soc_info) { + return CreateGraphCustomConfigCommon(qcom_target_soc_info, 3); +} +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/serialization/qc_binary_info.fbs b/backends/qualcomm/serialization/qc_binary_info.fbs new file mode 100644 index 00000000000..3f301055269 --- /dev/null +++ b/backends/qualcomm/serialization/qc_binary_info.fbs @@ -0,0 +1,20 @@ +//============================================================================ +// +// Copyright (c) Qualcomm Innovation Center, Inc. +// All rights reserved +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +//============================================================================ + +namespace qnn_delegate; + +table BinaryInfo { + // Signature of binary + signature: string; + // Data of processed binary + data: [ubyte]; +} + +root_type BinaryInfo; diff --git a/backends/qualcomm/serialization/schema.fbs b/backends/qualcomm/serialization/qc_compiler_spec.fbs similarity index 91% rename from backends/qualcomm/serialization/schema.fbs rename to backends/qualcomm/serialization/qc_compiler_spec.fbs index e8ef108f3a6..bd097fc5ccd 100644 --- a/backends/qualcomm/serialization/schema.fbs +++ b/backends/qualcomm/serialization/qc_compiler_spec.fbs @@ -1,8 +1,10 @@ //============================================================================ // -// Copyright (c) 2023 Qualcomm Technologies, Inc. -// All Rights Reserved. -// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// Copyright (c) Qualcomm Innovation Center, Inc. +// All rights reserved +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. // //============================================================================ @@ -118,6 +120,10 @@ table QnnExecuTorchHtpBackendOptions { /// pte, it is possible to reserve a single spill-fill allocation that /// could be re-used across all the splits. use_multi_contexts:bool; + + /// When multiple graphs appear inside the same context, + /// weights could be reused across all graphs. + use_weight_sharing:bool; } /// Logging level of the delegate and QNN backend. @@ -177,7 +183,10 @@ table QnnExecuTorchOptions { shared_buffer:bool; /// Is model from qnn context binary - is_from_context_binary: bool; + is_from_context_binary:bool; + + /// True if there exists multiple graphs in one .pte file. + multiple_graphs:bool; } root_type QnnExecuTorchOptions; diff --git a/backends/qualcomm/serialization/qnn_compile_spec_schema.py b/backends/qualcomm/serialization/qc_schema.py similarity index 95% rename from backends/qualcomm/serialization/qnn_compile_spec_schema.py rename to backends/qualcomm/serialization/qc_schema.py index ce139a54cbe..816d8134184 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_schema.py +++ b/backends/qualcomm/serialization/qc_schema.py @@ -12,6 +12,12 @@ from enum import IntEnum, unique +@dataclass +class BinaryInfo: + signature: str = "" + data: bytes = None + + @unique class HtpArch(IntEnum): NONE = 0 @@ -98,6 +104,7 @@ class QnnExecuTorchHtpBackendOptions: use_dlbc: bool = False use_fold_relu: bool = True use_multi_contexts: bool = False + use_weight_sharing: bool = False @unique @@ -136,3 +143,4 @@ class QnnExecuTorchOptions: profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff shared_buffer: bool = False is_from_context_binary: bool = False + multiple_graphs: bool = False diff --git a/backends/qualcomm/serialization/qnn_compile_spec_serialize.py b/backends/qualcomm/serialization/qc_schema_serialize.py similarity index 50% rename from backends/qualcomm/serialization/qnn_compile_spec_serialize.py rename to backends/qualcomm/serialization/qc_schema_serialize.py index 49227628e5e..59610d7b996 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_serialize.py +++ b/backends/qualcomm/serialization/qc_schema_serialize.py @@ -9,41 +9,45 @@ import tempfile import pkg_resources -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QnnExecuTorchOptions, -) +from executorch.backends.qualcomm.serialization.qc_schema import QnnExecuTorchOptions from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile -def convert_to_flatbuffer(qnn_executorch_options: QnnExecuTorchOptions) -> bytes: - qnn_executorch_options_json = json.dumps( - qnn_executorch_options, cls=_DataclassEncoder - ) +def _convert_to_flatbuffer(obj, schema: str): + obj_json = json.dumps(obj, cls=_DataclassEncoder) with tempfile.TemporaryDirectory() as d: - schema_path = os.path.join(d, "schema.fbs") + schema_path = os.path.join(d, f"{schema}.fbs") with open(schema_path, "wb") as schema_file: - schema_file.write(pkg_resources.resource_string(__name__, "schema.fbs")) - json_path = os.path.join(d, "schema.json") + schema_file.write(pkg_resources.resource_string(__name__, f"{schema}.fbs")) + json_path = os.path.join(d, 
f"{schema}.json") with open(json_path, "wb") as json_file: - json_file.write(qnn_executorch_options_json.encode("ascii")) + json_file.write(obj_json.encode("ascii")) _flatc_compile(d, schema_path, json_path) - output_path = os.path.join(d, "schema.bin") + output_path = os.path.join(d, f"{schema}.bin") with open(output_path, "rb") as output_file: return output_file.read() -def convert_to_option(processed_bytes: bytes) -> QnnExecuTorchOptions: +def _convert_to_object(flatbuffers: bytes, obj_type, schema: str): with tempfile.TemporaryDirectory() as d: - json_path = os.path.join(d, "options.json") - schema_path = os.path.join(d, "schema.fbs") - bin_path = os.path.join(d, "options.bin") + json_path = os.path.join(d, f"{schema}.json") + schema_path = os.path.join(d, f"{schema}.fbs") + bin_path = os.path.join(d, f"{schema}.bin") with open(schema_path, "wb") as schema_file: - schema_file.write(pkg_resources.resource_string(__name__, "schema.fbs")) + schema_file.write(pkg_resources.resource_string(__name__, f"{schema}.fbs")) with open(bin_path, "wb") as bin_file: - bin_file.write(processed_bytes) + bin_file.write(flatbuffers) _flatc_decompile(d, schema_path, bin_path, ["--raw-binary"]) with open(json_path, "rb") as output_file: - return _json_to_dataclass(json.load(output_file), QnnExecuTorchOptions) + return _json_to_dataclass(json.load(output_file), obj_type) + + +def option_to_flatbuffer(qnn_executorch_options: QnnExecuTorchOptions) -> bytes: + return _convert_to_flatbuffer(qnn_executorch_options, "qc_compiler_spec") + + +def flatbuffer_to_option(flatbuffers: bytes) -> QnnExecuTorchOptions: + return _convert_to_object(flatbuffers, QnnExecuTorchOptions, "qc_compiler_spec") diff --git a/backends/qualcomm/serialization/targets.bzl b/backends/qualcomm/serialization/targets.bzl index c3c571109e7..c4a40ba0130 100644 --- a/backends/qualcomm/serialization/targets.bzl +++ b/backends/qualcomm/serialization/targets.bzl @@ -6,10 +6,10 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. """ - + export_file( - name = "qnn_schema", - src = "schema.fbs", + name = "qc_compiler_spec_schema", + src = "qc_compiler_spec.fbs", visibility = ["//executorch/backends/qualcomm/serialization/..."], ) @@ -19,7 +19,7 @@ def define_common_targets(): "*.py", ]), resources = { - ":qnn_schema": "schema.fbs", + ":qc_compiler_spec_schema": "qc_compiler_spec.fbs", }, visibility = [ "@EXECUTORCH_CLIENTS", diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl index 929ccd97441..08d163eefc3 100644 --- a/backends/qualcomm/targets.bzl +++ b/backends/qualcomm/targets.bzl @@ -6,13 +6,13 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") # Construct the input and output file names. All input and output files rely on scalar_type file. 
-SCHEMA_NAME = "schema" +SCHEMA_NAME = "qc_compiler_spec" INPUT_SCHEMA = "serialization/" + SCHEMA_NAME + ".fbs" OUTPUT_SCHEMA_HEADER = SCHEMA_NAME + "_generated.h" -SCHEMA_GEN_RULE_NAME = "schema_generated" +SCHEMA_GEN_RULE_NAME = "qc_compiler_spec_generated" SCHEMA_LIRRARY_NAME = SCHEMA_NAME diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 62f3ecc3ca2..0ed66329c33 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -549,8 +549,8 @@ def forward(self, x): class Index(torch.nn.Module): def __init__(self): super().__init__() - self.idx0 = torch.tensor([[0, 1], [2, 3], [4, 5]]) - self.idx1 = torch.tensor([[1, 2], [3, 4], [5, 6]]) + self.idx0 = torch.tensor([[0, 1], [2, 3], [4, 5]], dtype=torch.int32) + self.idx1 = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=torch.int32) def forward(self, x): return x[self.idx0] + x[self.idx1] diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 4fed86f5df8..10917cdd6bf 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -29,12 +29,13 @@ ) from executorch.backends.qualcomm.utils.utils import ( - canonicalize_program, capture_program, from_context_binary, generate_htp_compiler_spec, + generate_multi_graph_program, generate_qnn_executorch_compiler_spec, skip_annotation, + update_spill_fill_size, ) from executorch.examples.models.llama.llama_transformer import ModelArgs, MOEFeedForward @@ -1536,7 +1537,7 @@ def test_qnn_backend_multi_contexts(self): ) partitioner = QnnPartitioner(compiler_specs) edge_prog.exported_program = to_backend(edge_prog.exported_program, partitioner) - canonicalize_program(edge_prog.exported_program) + update_spill_fill_size(edge_prog.exported_program) exec_prog = edge_prog.to_executorch() self.verify_output(module, sample_input, exec_prog) @@ -1560,10 +1561,54 @@ def test_qnn_backend_multi_contexts_composite(self): edge_prog = to_edge( torch.export.export(module, sample_input), ) - canonicalize_program(edge_prog.exported_program()) + update_spill_fill_size(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) + def test_qnn_backend_multi_graphs(self): + if self.enable_x86_64: + self.skipTest("weight sharing is not supported on host machine") + + seq_conv = Conv2dSequential() # noqa: F405 + # weight sharing + modules = [seq_conv, seq_conv.second] + sample_inputs = [(torch.randn([1, 1, 3, 3]),), (torch.randn([1, 3, 3, 3]),)] + graph_names = ["seq_conv", "single_conv"] + edge_progs = [ + capture_program(module, sample_input) + for module, sample_input in zip(modules, sample_inputs) + ] + backend_options = generate_htp_compiler_spec( + use_fp16=True, + ) + compiler_specs = [ + generate_qnn_executorch_compiler_spec( + soc_model=self.chipset_table[TestQNN.model], + backend_options=backend_options, + multiple_graphs=True, + graph_name=graph_name, + ) + for graph_name in graph_names + ] + exported_programs = [ + to_backend(edge_prog.exported_program, QnnPartitioner(compiler_specs[i])) + for i, edge_prog in enumerate(edge_progs) + ] + prog_mgr = generate_multi_graph_program( + compiler_specs=compiler_specs[0], + processed_bytes=[ + prog.graph_module.lowered_module_0.processed_bytes + for prog in exported_programs + ], + ) + for index, module in enumerate(modules): + self.verify_output( + module=module, + sample_inputs=sample_inputs[index], + 
executorch_prog=prog_mgr, + method_index=index, + ) + def test_qnn_backend_profile_op(self): TestQNN.enable_profile = True backend_options = generate_htp_compiler_spec(use_fp16=True) @@ -1621,22 +1666,13 @@ def test_qnn_backend_context_direct(self): ) ctx_path = f"{tmp_dir}/model_ctx.bin" bundle_program = from_context_binary(ctx_path, "ctx_loader") - backend_options = generate_htp_compiler_spec(use_fp16=True) - compiler_specs = generate_qnn_executorch_compiler_spec( - soc_model=self.chipset_table[TestQNN.model], - backend_options=backend_options, - is_from_context_binary=True, - ) - lowered_module = to_backend( - "QnnBackend", bundle_program["edge_program"], compiler_specs - ) self.verify_output( module, tuple( torch.randn(size=v.shape, dtype=v.dtype) for v in bundle_program["inputs"].values() ), - lowered_module, + bundle_program["edge_program_manager"].to_executorch(), ) @@ -1819,7 +1855,7 @@ def test_qnn_backend_multi_contexts(self): ) partitioner = QnnPartitioner(compiler_specs) edge_prog.exported_program = to_backend(edge_prog.exported_program, partitioner) - canonicalize_program(edge_prog.exported_program) + update_spill_fill_size(edge_prog.exported_program) exec_prog = edge_prog.to_executorch() self.verify_output(module, sample_input, exec_prog) @@ -1844,10 +1880,54 @@ def test_qnn_backend_multi_contexts_composite(self): edge_prog = to_edge( torch.export.export(module, sample_input), ) - canonicalize_program(edge_prog.exported_program()) + update_spill_fill_size(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) + def test_qnn_backend_multi_graphs(self): + if self.enable_x86_64: + self.skipTest("weight sharing is not supported on host machine") + + seq_conv = Conv2dSequential() # noqa: F405 + # weight sharing + modules = [seq_conv, seq_conv.second] + sample_inputs = [(torch.randn([1, 1, 3, 3]),), (torch.randn([1, 3, 3, 3]),)] + graph_names = ["seq_conv", "single_conv"] + edge_progs = [ + capture_program(self.get_qdq_module(module, sample_input), sample_input) + for module, sample_input in zip(modules, sample_inputs) + ] + backend_options = generate_htp_compiler_spec( + use_fp16=True, + ) + compiler_specs = [ + generate_qnn_executorch_compiler_spec( + soc_model=self.chipset_table[TestQNN.model], + backend_options=backend_options, + multiple_graphs=True, + graph_name=graph_name, + ) + for graph_name in graph_names + ] + exported_programs = [ + to_backend(edge_prog.exported_program, QnnPartitioner(compiler_specs[i])) + for i, edge_prog in enumerate(edge_progs) + ] + prog_mgr = generate_multi_graph_program( + compiler_specs=compiler_specs[0], + processed_bytes=[ + prog.graph_module.lowered_module_0.processed_bytes + for prog in exported_programs + ], + ) + for index, module in enumerate(modules): + self.verify_output( + module=module, + sample_inputs=sample_inputs[index], + executorch_prog=prog_mgr, + method_index=index, + ) + def test_qnn_backend_profile_op(self): TestQNN.enable_profile = True backend_options = generate_htp_compiler_spec(use_fp16=False) @@ -1908,22 +1988,13 @@ def test_qnn_backend_context_direct(self): ) ctx_path = f"{tmp_dir}/model_ctx.bin" bundle_program = from_context_binary(ctx_path, "ctx_loader") - backend_options = generate_htp_compiler_spec(use_fp16=False) - compiler_specs = generate_qnn_executorch_compiler_spec( - soc_model=self.chipset_table[TestQNN.model], - backend_options=backend_options, - is_from_context_binary=True, - ) - lowered_module = to_backend( - 
"QnnBackend", bundle_program["edge_program"], compiler_specs - ) self.verify_output( module, tuple( torch.randn(size=v.shape, dtype=v.dtype) for v in bundle_program["inputs"].values() ), - lowered_module, + bundle_program["edge_program_manager"].to_executorch(), ) diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index d2a3e7c2417..96591eb8906 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -9,7 +9,7 @@ import subprocess import tempfile import unittest -from typing import Callable, Dict, List, Literal, Optional, Tuple +from typing import Callable, Dict, List, Optional, Tuple import numpy as np import torch @@ -18,9 +18,7 @@ from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.qnn_preprocess import QnnBackend from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( capture_program, get_soc_to_chipset_map, @@ -35,7 +33,6 @@ from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.pass_base import ExportPass from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from executorch.exir.program import ExecutorchProgram, ExecutorchProgramManager @@ -114,19 +111,19 @@ def generate_context_binary( class TestQNN(unittest.TestCase): rtol: float = 0 atol: float = 0 - host: Literal = "" - device: Literal = "" - build_folder: Literal = "" + host: str = "" + device: str = "" + build_folder: str = "" model: QcomChipset = None compiler_specs: List[CompileSpec] = None chipset_table = get_soc_to_chipset_map() error_only = False ip = "localhost" port = 8080 - executorch_root: Literal = "" - artifact_dir: Literal = "" - image_dataset: Literal = "" - pretrained_weight: Literal = "" + executorch_root: str = "" + artifact_dir: str = "" + image_dataset: str = "" + pretrained_weight: str = "" enable_profile: bool = False online_prepare: bool = False use_8a8w: str = "8a8w" @@ -150,7 +147,7 @@ def _save_model_and_expected_output( module: torch.nn.Module, buffer: exir.ExirExportedProgram, inputs: Tuple[torch.Tensor], - dir_name: Literal, + dir_name: str, ) -> None: # Save the input data list to be executed input_list = "" @@ -181,26 +178,20 @@ def verify_output( # noqa: C901 self, module: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], - executorch_prog: ExecutorchProgram | LoweredBackendModule, + executorch_prog: ExecutorchProgram | ExecutorchProgramManager, etrecord_path: str = "etrecord.bin", expected_profile_events: int = -1, expected_intermediate_events: int = -1, + method_index: int = 0, ): with tempfile.TemporaryDirectory() as tmp_dir: - buffer = ( - executorch_prog.buffer - if isinstance( - executorch_prog, (ExecutorchProgram, ExecutorchProgramManager) - ) - else executorch_prog.buffer() - ) ( input_list, ref_outputs, pte_fname, ) = self._save_model_and_expected_output( module, - buffer, + executorch_prog.buffer, sample_inputs, tmp_dir, ) @@ -253,11 +244,13 @@ def validate_intermediate_tensor(): # qnn_executor_runner 
f"{build_folder}/examples/qualcomm/executor_runner/qnn_executor_runner", "--model_path", - f"{pte_fname}", + pte_fname, "--input_list_path", f"{tmp_dir}/input_list.txt", "--output_folder_path", - f"{output_dir}", + output_dir, + "--method_index", + str(method_index), ] if expected_intermediate_events != -1: cmd.append("--dump_intermediate_outputs") @@ -305,7 +298,7 @@ def validate_intermediate_tensor(): ), ) adb.push(inputs=[sample_inputs], input_list=input_list) - adb.execute() + adb.execute(method_index=method_index) adb.pull(output_path=tmp_dir, callback=post_process) self._assert_outputs_equal(outputs, ref_outputs) @@ -343,7 +336,6 @@ def lower_module_and_test_output( ) exec_prog = delegated_program.to_executorch( exir.ExecutorchBackendConfig( - extract_delegate_segments=False, # For shared buffer, user must pass the memory address # which is allocated by RPC memory to executor runner. # Therefore, won't want to pre-allocate diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index cb54412add0..4bda07fdc2b 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -7,7 +7,7 @@ import operator import warnings from collections import OrderedDict -from typing import Callable, Dict, FrozenSet, List, Set, Tuple +from typing import Callable, Dict, FrozenSet, List, Tuple import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor @@ -50,7 +50,11 @@ QNN_TENSOR_TYPE_MAP, ) from executorch.backends.qualcomm.builders.qnn_constants import OpContextLoader -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( +from executorch.backends.qualcomm.partition.qnn_partitioner import ( + generate_qnn_executorch_option, + QnnPartitioner, +) +from executorch.backends.qualcomm.serialization.qc_schema import ( _soc_info_table, HtpArch, QcomChipset, @@ -63,9 +67,9 @@ QnnExecuTorchOptions, QnnExecuTorchProfileLevel, ) -from executorch.backends.qualcomm.serialization.qnn_compile_spec_serialize import ( - convert_to_flatbuffer, - convert_to_option, +from executorch.backends.qualcomm.serialization.qc_schema_serialize import ( + flatbuffer_to_option, + option_to_flatbuffer, ) from executorch.backends.qualcomm.utils.constants import ( QCOM_PASS_EXPAND_BROADCAST_SHAPE, @@ -74,8 +78,14 @@ QCOM_QUANTIZED_IO, ) -from executorch.exir import ExirExportedProgram +from executorch.exir import ( + EdgeCompileConfig, + ExecutorchProgramManager, + ExirExportedProgram, + to_edge, +) from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.capture import ExecutorchBackendConfig from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.program._program import _get_updated_graph_signature from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions @@ -202,9 +212,8 @@ def replace_linear(module: torch.nn.Module): return replace_linear(module) -def canonicalize_program( +def update_spill_fill_size( exported_program: ExportedProgram | List[LoweredBackendModule], - custom_buffer_size=None, ): # check if user specifies to use multi_contexts # this is a generic approach in case there exists multiple backends @@ -213,7 +222,7 @@ def process_exported_program(prog): max_sf_buf_size, module_map = 0, {} for _, m in prog.graph_module._modules.items(): # currently only 1 compile spec is expected in each partition - options = convert_to_option(m.compile_specs[0].value) + options = flatbuffer_to_option(m.compile_specs[0].value) if ( 
options.backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend @@ -235,14 +244,10 @@ def process_lowered_module(module): module.compile_specs[0].value, module.processed_bytes ) assert qnn_mgr.Init().value == 0, "failed to load context binary" - spill_fill_size = ( - qnn_mgr.GetSpillFillBufferSize() - if custom_buffer_size is None - else custom_buffer_size - ) + spill_fill_size = qnn_mgr.GetSpillFillBufferSize() qnn_mgr.Destroy() return spill_fill_size, { - module: convert_to_option(module.compile_specs[0].value) + module: flatbuffer_to_option(module.compile_specs[0].value) } dispatch = { @@ -253,7 +258,7 @@ def process_lowered_module(module): def update_program(max_sf_buf_size, module_map): def set_spec(module, options): - spec = CompileSpec(QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(options)) + spec = CompileSpec(QCOM_QNN_COMPILE_SPEC, option_to_flatbuffer(options)) if isinstance(module, ExportedProgram): module.compile_specs[0] = spec else: @@ -331,11 +336,10 @@ def _transform( def capture_program( module: torch.nn.Module, inputs: Tuple[torch.Tensor], - custom_pass_config: Set[str] = frozenset(), + custom_pass_config: FrozenSet[str] = frozenset(), ) -> exir.ExirExportedProgram: ep = torch.export.export(module, inputs) decomposed_ep = ep.run_decompositions(get_decomp_table()) - # We choose call_operator by target in ConvertBinaryOpsWithScalar # because it is the same source_fn_stack for MultiheadAttention # TODO: Should modify the scalar op in the op builder instead of @@ -529,11 +533,11 @@ def skip_annotation( Returns: exported_programs: List of programs lowered to QnnBackend (quantized graphs only). """ - from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( + from executorch.backends.qualcomm.serialization.qc_schema import ( QnnExecuTorchHtpPrecision, ) - from executorch.backends.qualcomm.serialization.qnn_compile_spec_serialize import ( - convert_to_option, + from executorch.backends.qualcomm.serialization.qc_schema_serialize import ( + flatbuffer_to_option, ) from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner @@ -585,14 +589,14 @@ def prepare_subgm(subgm, subgm_name): qnn_option = generate_qnn_executorch_option( partitioner.compiler_specs_snapshot ) - compile_option = convert_to_option(qnn_option) + compile_option = flatbuffer_to_option(qnn_option) htp_options = compile_option.backend_options.htp_options htp_options.precision = QnnExecuTorchHtpPrecision.kHtpFp16 partitioner.delegation_spec = DelegationSpec( "QnnBackend", [ CompileSpec( - QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(compile_option) + QCOM_QNN_COMPILE_SPEC, option_to_flatbuffer(compile_option) ) ], ) @@ -626,9 +630,14 @@ def prepare_subgm(subgm, subgm_name): return graph_module, exported_progs -def from_context_binary( - ctx_path: str, op_name: str, soc_model: QcomChipset = QcomChipset.SM8650 +def from_context_binary( # noqa: C901 + ctx_path: str | bytes, + op_name: str, + soc_model: QcomChipset = QcomChipset.SM8650, + custom_info: Dict = None, ): + from pathlib import Path + def implement_op(custom_op, op_name, outputs): @torch.library.impl( custom_op, str(op_name), dispatch_key="CompositeExplicitAutograd" @@ -661,7 +670,7 @@ def forward(self, *inputs): return { "custom_op": custom_op, "custom_module": model, - "edge_program": prog, + "exported_program": prog, } def build_tensor(tensors, dtype_map): @@ -674,8 +683,12 @@ def build_tensor(tensors, dtype_map): return ret - with 
open(ctx_path, "rb") as f: - ctx_bin = f.read() + def preprocess_binary(ctx_bin, compiler_specs): + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + generate_qnn_executorch_option(compiler_specs), + ) + return bytes(qnn_mgr.MakeBinaryInfo(ctx_bin)) + # dummy compiler spec would be fine, since we're not compiling backend_options = generate_htp_compiler_spec(use_fp16=False) compiler_specs = generate_qnn_executorch_compiler_spec( @@ -683,26 +696,57 @@ def build_tensor(tensors, dtype_map): backend_options=backend_options, is_from_context_binary=True, ) - # get context-binary io tensor info through qnn manager - qnn_mgr = PyQnnManagerAdaptor.QnnManager( - generate_qnn_executorch_option(compiler_specs), ctx_bin + + ctx_bin = ( + ctx_path + if not isinstance(ctx_path, str) + else preprocess_binary(Path(f"{ctx_path}").read_bytes(), compiler_specs) ) - assert qnn_mgr.Init().value == 0, "failed to load context binary" - qnn_mgr.AllocateTensor() + dtype_map = {} for type_map in (QNN_QUANT_TYPE_MAP, QNN_TENSOR_TYPE_MAP): for k, v in type_map.items(): dtype_map.setdefault(v, k) - inputs = build_tensor(qnn_mgr.GetGraphInputs(), dtype_map) - outputs = build_tensor(qnn_mgr.GetGraphOutputs(), dtype_map) - qnn_mgr.Destroy() + + if custom_info is not None: + # since some context binaries might fail to open on host + # if they are compiled with special flags: + # e.g. weight sharing + # use custom information here instead + inputs = build_tensor(custom_info["graph_inputs"], dtype_map) + outputs = build_tensor(custom_info["graph_outputs"], dtype_map) + graph_name = custom_info["graph_name"] + else: + # get context-binary io tensor info through qnn manager + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + generate_qnn_executorch_option(compiler_specs), + ctx_bin, + ) + assert qnn_mgr.Init().value == 0, "failed to load context binary" + # assume we only have one graph in current context + graph_name = qnn_mgr.GetGraphNames()[0] + qnn_mgr.AllocateTensor(graph_name) + inputs = build_tensor(qnn_mgr.GetGraphInputs(graph_name), dtype_map) + outputs = build_tensor(qnn_mgr.GetGraphOutputs(graph_name), dtype_map) + qnn_mgr.Destroy() + # generate graph specific for loading context bundle_prog = build_graph(inputs, outputs) bundle_prog.update({"inputs": inputs, "outputs": outputs}) - for n in bundle_prog["edge_program"].graph_module.graph.nodes: - if op_name in n.name: + edge_prog_mgr = to_edge( + programs={graph_name: bundle_prog["exported_program"]}, + # do not alter name for custom op + compile_config=EdgeCompileConfig(_use_edge_ops=False), + ) + # update meta with context binary + for n in edge_prog_mgr._edge_programs[graph_name].graph.nodes: + if n.op == "call_function" and OpContextLoader.namespace in str(n.target): n.meta[OpContextLoader.meta_ctx_bin] = ctx_bin break + + bundle_prog["edge_program_manager"] = edge_prog_mgr.to_backend( + QnnPartitioner(compiler_specs) + ) return bundle_prog @@ -712,15 +756,59 @@ def draw_graph(title, path, graph_module: torch.fx.GraphModule): f.write(graph.get_dot_graph().create_svg()) -def generate_qnn_executorch_option( +def generate_multi_graph_program( compiler_specs: List[CompileSpec], -) -> bytes: - for compiler_spec in compiler_specs: - if compiler_spec.key == QCOM_QNN_COMPILE_SPEC: - qnn_compile_spec_buffer = compiler_spec.value - else: - raise ValueError(f"unknown compiler spec key value: {compiler_spec.key}") - return qnn_compile_spec_buffer + processed_bytes: List[bytes], + backend_config: ExecutorchBackendConfig = None, +) -> ExecutorchProgramManager: + # compile multiple graphs 
in qcir into single context binary + graph_inputs, graph_outputs = {}, {} + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + generate_qnn_executorch_option(compiler_specs), processed_bytes + ) + assert qnn_mgr.Init().value == 0, "failed to load processed bytes" + binary_info = bytes(qnn_mgr.Compile()) + assert len(binary_info) != 0, "failed to generate QNN context binary" + graph_names = qnn_mgr.GetGraphNames() + for graph_name in graph_names: + graph_inputs[graph_name] = qnn_mgr.GetGraphInputs(graph_name) + graph_outputs[graph_name] = qnn_mgr.GetGraphOutputs(graph_name) + qnn_mgr.Destroy() + + # build custom ops with different graph signatures + compiler_options = flatbuffer_to_option(compiler_specs[0].value) + bundle_progs = [ + from_context_binary( + ctx_path=binary_info, + op_name=f"loader_{graph_name}", + soc_model=compiler_options.soc_info.soc_model, + custom_info={ + "graph_inputs": graph_inputs[graph_name], + "graph_outputs": graph_outputs[graph_name], + "graph_name": graph_name, + }, + ) + for graph_name in graph_names + ] + # leverage ExecutorchProgramManager for generating pte with multi-methods + edge_prog_mgr = to_edge( + programs={ + graph_name: bundle_prog["exported_program"] + for graph_name, bundle_prog in zip(graph_names, bundle_progs) + }, + # do not alter name for custom op + compile_config=EdgeCompileConfig(_use_edge_ops=False), + ) + # restore meta losed in generating EdgeProgramManager + for graph_name in graph_names: + for n in edge_prog_mgr._edge_programs[graph_name].graph.nodes: + if graph_name in n.name: + n.meta[OpContextLoader.meta_ctx_bin] = binary_info + break + + return edge_prog_mgr.to_backend(QnnPartitioner(compiler_specs)).to_executorch( + config=backend_config or ExecutorchBackendConfig() + ) def generate_htp_compiler_spec( @@ -773,6 +861,8 @@ def generate_qnn_executorch_compiler_spec( optrace: bool = False, shared_buffer: bool = False, is_from_context_binary: bool = False, + multiple_graphs: bool = False, + graph_name: str = "forward", ) -> List[CompileSpec]: """ Helper function generating compiler specs for Qualcomm AI Engine Direct @@ -798,6 +888,10 @@ def generate_qnn_executorch_compiler_spec( profile the performance of each operator with cycle unit. shared_buffer: Enables usage of shared buffer between application and backend for graph I/O. + is_from_context_binary: True if current graph comes from pre-built context binary. + multiple_graphs: True if multiple methods are expected to have in single .pte file. + Please see test cases for post-processing example. + graph_name: Assign unique graph name if 'multiple_graphs' is used. Returns: List[CompileSpec]: Compiler specs for Qualcomm AI Engine Direct. 
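        As a hypothetical end-to-end sketch (modeled on the test_qnn_backend_multi_graphs
        test case in this change; the modules, input shapes, graph names, and soc_model
        below are placeholders):

            import torch
            from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
            from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
            from executorch.backends.qualcomm.utils.utils import (
                capture_program,
                generate_htp_compiler_spec,
                generate_multi_graph_program,
                generate_qnn_executorch_compiler_spec,
            )
            from executorch.exir.backend.backend_api import to_backend

            modules = [MyEncoder(), MyDecoder()]  # placeholder nn.Modules
            sample_inputs = [(torch.randn(1, 16),), (torch.randn(1, 16),)]
            graph_names = ["encoder", "decoder"]

            backend_options = generate_htp_compiler_spec(use_fp16=True)
            # one compile spec per graph, each carrying a unique graph_name
            compiler_specs = [
                generate_qnn_executorch_compiler_spec(
                    soc_model=QcomChipset.SM8650,
                    backend_options=backend_options,
                    multiple_graphs=True,
                    graph_name=name,
                )
                for name in graph_names
            ]
            edge_progs = [
                capture_program(m, inp) for m, inp in zip(modules, sample_inputs)
            ]
            exported_programs = [
                to_backend(prog.exported_program, QnnPartitioner(compiler_specs[i]))
                for i, prog in enumerate(edge_progs)
            ]
            # merge the lowered graphs into one context binary and emit a multi-method pte
            prog_mgr = generate_multi_graph_program(
                compiler_specs=compiler_specs[0],
                processed_bytes=[
                    prog.graph_module.lowered_module_0.processed_bytes
                    for prog in exported_programs
                ],
            )
            with open("multi_graph.pte", "wb") as f:
                f.write(prog_mgr.buffer)

        Each graph name becomes a separate method in the resulting .pte; the updated
        qnn_executor_runner in this change can select one at runtime through the new
        --method_index flag.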
@@ -820,7 +914,7 @@ def generate_qnn_executorch_compiler_spec( qnn_executorch_options = QnnExecuTorchOptions( _soc_info_table[soc_model], backend_options ) - qnn_executorch_options.graph_name = "executorch" + qnn_executorch_options.graph_name = graph_name qnn_executorch_options.log_level = ( QnnExecuTorchLogLevel.kLogLevelDebug if debug @@ -854,11 +948,15 @@ def generate_qnn_executorch_compiler_spec( qnn_executorch_options.shared_buffer = shared_buffer qnn_executorch_options.online_prepare = online_prepare qnn_executorch_options.is_from_context_binary = is_from_context_binary + qnn_executorch_options.multiple_graphs = multiple_graphs + + if multiple_graphs: + # enable weight sharing mechanism if multiple graphs appear + if backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend: + backend_options.htp_options.use_weight_sharing = True return [ - CompileSpec( - QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(qnn_executorch_options) - ) + CompileSpec(QCOM_QNN_COMPILE_SPEC, option_to_flatbuffer(qnn_executorch_options)) ] diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 7235e36681e..7a8af5181aa 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -50,6 +50,7 @@ DEFINE_bool( shared_buffer, false, "Specifies to use shared buffers for zero-copy usecase between the application and device/co-processor associated with the backend."); +DEFINE_uint32(method_index, 0, "Index of methods to be specified."); DEFINE_string( etdump_path, @@ -145,7 +146,9 @@ int main(int argc, char** argv) { const char* model_path = FLAGS_model_path.c_str(); Result loader = FileDataLoader::from(model_path); ET_CHECK_MSG( - loader.ok(), "FileDataLoader::from() failed: 0x%" PRIx32, loader.error()); + loader.ok(), + "FileDataLoader::from() failed: 0x%" PRIx32, + (int)loader.error()); // Parse the program file. This is immutable, and can also be reused between // multiple execution invocations across multiple threads. @@ -156,10 +159,11 @@ int main(int argc, char** argv) { } ET_LOG(Info, "Model file %s is loaded.", model_path); - // Use the first method in the program. + // Use the designated method in the program, default to the first one const char* method_name = nullptr; { - const auto method_name_result = program->get_method_name(0); + const auto method_name_result = + program->get_method_name(FLAGS_method_index); ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); method_name = *method_name_result; } @@ -233,7 +237,7 @@ int main(int argc, char** argv) { method.ok(), "Loading of method %s failed with status 0x%" PRIx32, method_name, - method.error()); + (int)method.error()); ET_LOG(Info, "Method loaded."); void* debug_buffer; @@ -272,7 +276,7 @@ int main(int argc, char** argv) { custom_mem_ptr->GetPtr(), const_cast(tensor_meta->dim_order().data())); Error ret = method->set_input(Tensor(&impl), input_index); - ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", ret); + ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", (int)ret); } for (int output_index = 0; output_index < method->outputs_size(); ++output_index) { @@ -292,7 +296,9 @@ int main(int argc, char** argv) { // This can error if the outputs are already pre-allocated. Ignore // this error because it doesn't affect correctness, but log it. 
ET_LOG( - Info, "ignoring error from set_output_data_ptr(): 0x%" PRIx32, ret); + Info, + "ignoring error from set_output_data_ptr(): 0x%" PRIx32, + (int)ret); } } ET_LOG(Info, "Inputs prepared."); @@ -364,7 +370,8 @@ int main(int argc, char** argv) { const_cast( tensor_meta->dim_order().data())); Error ret = method->set_input(Tensor(&impl), input_index); - ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", ret); + ET_CHECK_MSG( + ret == Error::Ok, "Failed to set input tensor: %d", (int)ret); } Error status = Error::Ok; @@ -398,7 +405,7 @@ int main(int argc, char** argv) { status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, method_name, - status); + (int)status); std::vector outputs(method->outputs_size()); status = method->get_outputs(outputs.data(), method->outputs_size()); @@ -443,7 +450,7 @@ int main(int argc, char** argv) { status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, method_name, - status); + (int)status); ET_LOG(Info, "Model executed successfully."); } diff --git a/examples/qualcomm/oss_scripts/llama2/llama.py b/examples/qualcomm/oss_scripts/llama2/llama.py index 875752728e6..b4a9a60c20a 100755 --- a/examples/qualcomm/oss_scripts/llama2/llama.py +++ b/examples/qualcomm/oss_scripts/llama2/llama.py @@ -17,9 +17,7 @@ from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO from executorch.backends.qualcomm.utils.utils import ( capture_program, diff --git a/examples/qualcomm/oss_scripts/llama3_2/llama.py b/examples/qualcomm/oss_scripts/llama3_2/llama.py index 20d674888da..d277c9b4e77 100755 --- a/examples/qualcomm/oss_scripts/llama3_2/llama.py +++ b/examples/qualcomm/oss_scripts/llama3_2/llama.py @@ -24,9 +24,7 @@ ) from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO from executorch.backends.qualcomm.utils.utils import ( capture_program, diff --git a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py index c54b75a6b6a..3b4d6c7cbff 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py +++ b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py @@ -8,16 +8,14 @@ import os from multiprocessing.connection import Client -import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor import torch -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( # noqa: F401 - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( + ExecutorchBackendConfig, from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, - generate_qnn_executorch_option, + get_soc_to_chipset_map, ) from executorch.examples.qualcomm.qaihub_scripts.utils.utils import ( gen_pte_from_ctx_bin, @@ -27,6 +25,7 @@ setup_common_args_and_variables, SimpleADB, ) +from 
executorch.exir.passes.memory_planning_pass import MemoryPlanningPass def main(args): @@ -66,13 +65,27 @@ def main(args): if args.pre_gen_pte is None: # create custom operators as context loader + soc_model = get_soc_to_chipset_map()[args.model] bundle_programs = [ - from_context_binary(f"{args.context_binaries}/{target}", f"ctx_loader_{i}") + from_context_binary( + ctx_path=f"{args.context_binaries}/{target}", + op_name=f"ctx_loader_{i}", + soc_model=soc_model, + ) for i, target in enumerate(target_names) ] pte_names = [f"{pte_name}_{i}" for i in range(len(target_names))] + memory_planning_pass = MemoryPlanningPass( + alloc_graph_input=False, + alloc_graph_output=False, + ) pte_files = gen_pte_from_ctx_bin( - args.artifact, pte_names, compiler_specs, bundle_programs + artifact=args.artifact, + pte_names=pte_names, + bundle_programs=bundle_programs, + backend_config=ExecutorchBackendConfig( + memory_planning_pass=memory_planning_pass + ), ) else: pte_files = [f"{args.pre_gen_pte}/{pte_name}_{i}.pte" for i in range(4)] @@ -80,19 +93,6 @@ def main(args): if args.compile_only: return - def get_logit_encoding(path_to_last_shard: str): - with open(f"{args.context_binaries}/{path_to_last_shard}", "rb") as f: - ctx_bin = f.read() - qnn_mgr = PyQnnManagerAdaptor.QnnManager( - generate_qnn_executorch_option(compiler_specs), ctx_bin - ) - assert qnn_mgr.Init().value == 0, "failed to load context binary" - qnn_mgr.AllocateTensor() - logits = qnn_mgr.GetGraphOutputs()[-1] - encoding = logits.GetEncodings() - qnn_mgr.Destroy() - return encoding.data["scale"].item(), encoding.data["offset"].item() - adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=args.build_folder, diff --git a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py index 9acbeebef2d..7607c476051 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py +++ b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py @@ -9,14 +9,14 @@ from multiprocessing.connection import Client import torch -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( # noqa: F401 - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( + ExecutorchBackendConfig, from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + get_soc_to_chipset_map, ) from executorch.examples.qualcomm.qaihub_scripts.utils.utils import ( gen_pte_from_ctx_bin, @@ -26,6 +26,7 @@ setup_common_args_and_variables, SimpleADB, ) +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass def main(args): @@ -58,22 +59,34 @@ def main(args): pte_name = "qaihub_llama3_8b_prompt" last_shard_num_inputs = 4 last_shard_num_outputs = 65 - custom_spill_fill = 128974848 else: pte_name = "qaihub_llama3_8b_token" last_shard_num_inputs = 68 last_shard_num_outputs = 65 - custom_spill_fill = 3932160 if args.pre_gen_pte is None: # create custom operators as context loader + soc_model = get_soc_to_chipset_map()[args.model] bundle_programs = [ - from_context_binary(f"{args.context_binaries}/{target}", f"ctx_loader_{i}") + from_context_binary( + ctx_path=f"{args.context_binaries}/{target}", + op_name=f"ctx_loader_{i}", + soc_model=soc_model, + ) for i, target in enumerate(target_names) ] pte_names = [f"{pte_name}_{i}" for i in range(len(target_names))] + memory_planning_pass = MemoryPlanningPass( + alloc_graph_input=False, 
+ alloc_graph_output=False, + ) pte_files = gen_pte_from_ctx_bin( - args.artifact, pte_names, compiler_specs, bundle_programs, custom_spill_fill + artifact=args.artifact, + pte_names=pte_names, + bundle_programs=bundle_programs, + backend_config=ExecutorchBackendConfig( + memory_planning_pass=memory_planning_pass + ), ) else: pte_files = [f"{args.pre_gen_pte}/{pte_name}_{i}.pte" for i in range(5)] diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp index 9dc1ee7e254..9ee7551650a 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp @@ -29,7 +29,11 @@ Memory::Memory( input_tensors_(modules.size()), output_tensors_(modules.size()), pos_embs_path_(pos_embs_path), - modules_(modules) {} + modules_(modules) { + for (std::shared_ptr& module : modules_) { + method_names_.emplace_back(*module->method_names()->begin()); + } +} Memory::~Memory() {} @@ -436,7 +440,9 @@ void KVCachedMemory::update_io( int index = (cache_stride << 1) + (cache_group << 5) + head; ET_CHECK_MSG( modules_[shard]->set_output( - output_tensors[shard][index], index) == Error::Ok, + method_names_[shard], + output_tensors[shard][index], + index) == Error::Ok, "failed to set output tensor for module %d's %d'th output " "while updating kv_cache output tensors", shard, @@ -458,7 +464,8 @@ void KVCachedMemory::update_io( for (int shard = 0; shard < output_tensors.size(); shard++) { for (int index = 0; index < output_tensors[shard].size(); index++) { ET_CHECK_MSG( - modules_[shard]->set_output(output_tensors[shard][index], index) == + modules_[shard]->set_output( + method_names_[shard], output_tensors[shard][index], index) == Error::Ok, "failed to set output tensor for module %d's %d'th output " "while updating kv_cache output tensors", diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h index 4ad7264cc91..445be2ed21a 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h +++ b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h @@ -52,6 +52,7 @@ class Memory { std::vector> output_tensors_; std::vector pos_embs_path_; std::vector> modules_; + std::vector method_names_; }; class BertMemory : public Memory { diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp index 959f6810ae5..4bddb32b53e 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp @@ -115,7 +115,8 @@ Error Runner::load() { return Error::Ok; } for (std::shared_ptr& module : modules_) { - ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("forward")); + method_names_.emplace_back(*module->method_names()->begin()); + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(method_names_.back())); } // create sampler @@ -160,7 +161,8 @@ int32_t Runner::logitsToToken(const Tensor& logits_tensor) { void Runner::run_model_step(std::vector>& inputs) { for (size_t i = 0, num_modules = modules_.size(); i < num_modules; ++i) { - Result> outputs_res = modules_[i]->forward(inputs[i]); + Result> outputs_res = + modules_[i]->execute(method_names_[i], inputs[i]); ET_CHECK_MSG( outputs_res.error() == Error::Ok, "shard %zu inference failed", i); } @@ -185,7 +187,8 @@ Error Runner::generate( output_tensors.emplace_back(io_mem_->get_output_tensors(i)); for (size_t j = 0; j < 
output_tensors[i].size(); ++j) { ET_CHECK_MSG( - modules_[i]->set_output(output_tensors[i][j], j) == Error::Ok, + modules_[i]->set_output( + method_names_[i], output_tensors[i][j], j) == Error::Ok, "failed to set output tensor for module %d's %zu'th output", i, j); @@ -407,8 +410,8 @@ std::string statsToJsonString(const Runner::Stats& stats) { std::vector> Runner::get_methods_meta() { std::vector> methods_meta; methods_meta.reserve(modules_.size()); - for (std::shared_ptr& module : modules_) { - methods_meta.emplace_back(module->method_meta("forward")); + for (size_t i = 0; i < modules_.size(); ++i) { + methods_meta.emplace_back(modules_[i]->method_meta(method_names_[i])); } return methods_meta; } diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h index 0d15114bc64..be9af7e2275 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h @@ -98,6 +98,7 @@ class Runner { const int32_t max_seq_len_; int32_t eval_mode_; std::vector> modules_; + std::vector method_names_; std::string tokenizer_path_; float temperature_; std::unique_ptr tokenizer_; diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py index defce876ba0..8e56ce11e2e 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py @@ -13,14 +13,14 @@ import torch from diffusers import EulerDiscreteScheduler, UNet2DConditionModel from diffusers.models.embeddings import get_timestep_embedding -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) from executorch.backends.qualcomm.utils.utils import ( + ExecutorchBackendConfig, from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + get_soc_to_chipset_map, + QcomChipset, ) from executorch.examples.qualcomm.qaihub_scripts.stable_diffusion.stable_diffusion_lib import ( @@ -34,6 +34,7 @@ setup_common_args_and_variables, SimpleADB, ) +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from PIL import Image from torchvision.transforms import ToTensor @@ -353,7 +354,6 @@ def post_process_vae(): def main(args): os.makedirs(args.artifact, exist_ok=True) - # common part for compile & inference backend_options = generate_htp_compiler_spec( use_fp16=False, @@ -367,14 +367,24 @@ def main(args): if args.pre_gen_pte is None: # Create custom operators as context loader + soc_model = get_soc_to_chipset_map()[args.model] bundle_programs = [ - from_context_binary(args.text_encoder_bin, "ctx_loader_0"), - from_context_binary(args.unet_bin, "ctx_loader_1"), - from_context_binary(args.vae_bin, "ctx_loader_2"), + from_context_binary(args.text_encoder_bin, "ctx_loader_0", soc_model), + from_context_binary(args.unet_bin, "ctx_loader_1", soc_model), + from_context_binary(args.vae_bin, "ctx_loader_2", soc_model), ] pte_names = [f"{args.pte_prefix}_{target_name}" for target_name in target_names] + memory_planning_pass = MemoryPlanningPass( + alloc_graph_input=False, + alloc_graph_output=False, + ) pte_files = gen_pte_from_ctx_bin( - args.artifact, pte_names, compiler_specs, bundle_programs + artifact=args.artifact, + pte_names=pte_names, + bundle_programs=bundle_programs, + backend_config=ExecutorchBackendConfig( + 
memory_planning_pass=memory_planning_pass + ), ) assert ( len(pte_files) == 3 diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp index cc54a801737..585d58b21ee 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp @@ -75,8 +75,8 @@ Runner::Runner( std::vector> Runner::get_methods_meta() { std::vector> methods_meta; - for (std::unique_ptr& module : modules_) { - methods_meta.emplace_back(module->method_meta("forward")); + for (size_t i = 0; i < modules_.size(); ++i) { + methods_meta.emplace_back(modules_[i]->method_meta(method_names_[i])); } return methods_meta; } @@ -95,7 +95,8 @@ Error Runner::load() { } stats_.model_load_start_ms = time_in_ms(); for (auto& module : modules_) { - ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("forward")); + method_names_.emplace_back(*module->method_names()->begin()); + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(method_names_.back())); } stats_.model_load_end_ms = time_in_ms(); return Error::Ok; @@ -378,13 +379,14 @@ Error Runner::generate(std::string prompt) { uncond_emb_vec.data(), {1, 77, 1024}, encoder_method_meta.output_tensor_meta(0)->scalar_type()); - modules_[0]->set_output(cond_emb_tensor); + auto ret = modules_[0]->set_output(method_names_[0], cond_emb_tensor); long encoder_start = time_in_ms(); - auto cond_res = modules_[0]->forward(cond_tokens_tensor); + auto cond_res = modules_[0]->execute(method_names_[0], cond_tokens_tensor); stats_.text_encoder_execution_time += (time_in_ms() - encoder_start); - modules_[0]->set_output(uncond_emb_tensor); + ret = modules_[0]->set_output(method_names_[0], uncond_emb_tensor); encoder_start = time_in_ms(); - auto uncond_res = modules_[0]->forward(uncond_tokens_tensor); + auto uncond_res = + modules_[0]->execute(method_names_[0], uncond_tokens_tensor); stats_.text_encoder_execution_time += (time_in_ms() - encoder_start); // Initialize unet parameters @@ -467,15 +469,17 @@ Error Runner::generate(std::string prompt) { stats_.unet_aggregate_post_processing_time += (time_in_ms() - start_post_process); - modules_[1]->set_output(noise_pred_text_tensor); + ret = modules_[1]->set_output(method_names_[1], noise_pred_text_tensor); long start_unet_execution = time_in_ms(); - auto cond_res = modules_[1]->forward( + auto cond_res = modules_[1]->execute( + method_names_[1], {latent_tensor, time_emb_tensors[step_index], cond_emb_tensor}); stats_.unet_aggregate_execution_time += (time_in_ms() - start_unet_execution); - modules_[1]->set_output(noise_pred_uncond_tensor); + ret = modules_[1]->set_output(method_names_[1], noise_pred_uncond_tensor); start_unet_execution = time_in_ms(); - auto uncond_res = modules_[1]->forward( + auto uncond_res = modules_[1]->execute( + method_names_[1], {latent_tensor, time_emb_tensors[step_index], uncond_emb_tensor}); // results in noise_pred_uncond_vec @@ -524,9 +528,9 @@ Error Runner::generate(std::string prompt) { quant_tensor(latent, vae_input, vae_input_scale_, vae_input_offset_); - modules_[2]->set_output(output_tensor); + ret = modules_[2]->set_output(method_names_[2], output_tensor); long start_vae_execution = time_in_ms(); - auto vae_res = modules_[2]->forward(vae_input_tensor); + auto vae_res = modules_[2]->execute(method_names_[2], vae_input_tensor); stats_.vae_execution_time = (time_in_ms() - start_vae_execution); stats_.generate_end_ms = time_in_ms(); diff --git 
a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h index f91efd5b832..e49201bca25 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h @@ -112,6 +112,7 @@ class Runner { private: Stats stats_; std::vector> modules_; + std::vector method_names_; std::vector> time_emb_list_; std::unordered_map vocab_to_token_map_; diff --git a/examples/qualcomm/qaihub_scripts/utils/export.py b/examples/qualcomm/qaihub_scripts/utils/export.py index b742f59f1d4..4d252175dbb 100644 --- a/examples/qualcomm/qaihub_scripts/utils/export.py +++ b/examples/qualcomm/qaihub_scripts/utils/export.py @@ -15,18 +15,17 @@ import numpy as np import torch -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( draw_graph, + ExecutorchBackendConfig, from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, generate_qnn_executorch_option, ) +from executorch.examples.qualcomm.qaihub_scripts.utils.utils import preprocess_binary from executorch.examples.qualcomm.utils import make_output_dir, SimpleADB -from executorch.exir.backend.backend_api import to_backend from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass @@ -45,7 +44,7 @@ def get_logger(): return logging.LoggerAdapter(logger, extra={"prefix": "UTILS.EXPORT"}) -def get_io_info(prog_info, ctx_bin_path, compiler_spec): +def get_io_info(prog_info, ctx_bin_path, compiler_specs): def fill_tensor_info(info, qnn_tensors, category): # fetch related IO info stored in prog_info for i, (name, tensor) in enumerate(prog_info[category].items()): @@ -70,15 +69,16 @@ def fill_tensor_info(info, qnn_tensors, category): tensor_info = {in_key: [], out_key: []} with open(ctx_bin_path, "rb") as f: - ctx_bin = f.read() + ctx_bin = preprocess_binary(f.read(), compiler_specs) # leverage QNN pybind interface to retrieve tensor encodings qnn_mgr = PyQnnManagerAdaptor.QnnManager( - generate_qnn_executorch_option(compiler_spec), ctx_bin + generate_qnn_executorch_option(compiler_specs), ctx_bin ) assert qnn_mgr.Init().value == 0, "failed to load context binary" - qnn_mgr.AllocateTensor() - fill_tensor_info(tensor_info, qnn_mgr.GetGraphInputs(), in_key) - fill_tensor_info(tensor_info, qnn_mgr.GetGraphOutputs(), out_key) + graph_name = qnn_mgr.GetGraphNames()[0] + qnn_mgr.AllocateTensor(graph_name) + fill_tensor_info(tensor_info, qnn_mgr.GetGraphInputs(graph_name), in_key) + fill_tensor_info(tensor_info, qnn_mgr.GetGraphOutputs(graph_name), out_key) qnn_mgr.Destroy() return tensor_info @@ -250,28 +250,24 @@ def compile(args): postfix += 1 custom_op_name = f"{custom_op_name}_{postfix}" name_map[custom_op_name] = postfix - # step 1: generate ExportedProgram with custom op as binary loader + # step 1: generate ExportedProgram with custom op as binary loader & lower to QnnBackend logger.info(f"({index}/{num_bins}) exporting program for {ctx_bin}") prog_info = from_context_binary( ctx_bin, custom_op_name, getattr(QcomChipset, args.model) ) - # step 2: lower to QnnBackend - logger.info(f"({index}/{num_bins}) start lowering {ctx_bin} to QnnBackend") - lowered_module = to_backend( - "QnnBackend", prog_info["edge_program"], compiler_specs - ) - # step 3: write pte files and IO information + # step 2: write pte 
files and IO information logger.info(f"({index}/{num_bins}) exporting {binary_name}.pte") with open(f"{output_dir}/{binary_name}.pte", "wb") as f: - f.write( - lowered_module.buffer( - extract_delegate_segments=True, memory_planning=memory_planning_pass + prog_info["edge_program_manager"].to_executorch( + config=ExecutorchBackendConfig( + memory_planning_pass=memory_planning_pass ) - ) + ).write_to_file(f) + logger.info( f"({index}/{num_bins}) exporting network graph with {binary_name}.svg" ) - draw_graph(binary_name, output_dir, prog_info["edge_program"].graph_module) + draw_graph(binary_name, output_dir, prog_info["exported_program"].graph_module) logger.info( f"({index}/{num_bins}) exporting graph description with {binary_name}.json" ) diff --git a/examples/qualcomm/qaihub_scripts/utils/utils.py b/examples/qualcomm/qaihub_scripts/utils/utils.py index ad55d7fd10b..fc065b79af5 100644 --- a/examples/qualcomm/qaihub_scripts/utils/utils.py +++ b/examples/qualcomm/qaihub_scripts/utils/utils.py @@ -9,11 +9,16 @@ import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor from executorch.backends.qualcomm.utils.utils import ( - canonicalize_program, generate_qnn_executorch_option, + update_spill_fill_size, ) -from executorch.exir.backend.backend_api import to_backend -from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass + + +def preprocess_binary(ctx_bin, compiler_specs): + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + generate_qnn_executorch_option(compiler_specs), + ) + return bytes(qnn_mgr.MakeBinaryInfo(ctx_bin)) def get_encoding( @@ -26,16 +31,17 @@ def get_encoding( ): encoding_list = [] with open(path_to_shard, "rb") as f: - ctx_bin = f.read() + ctx_bin = preprocess_binary(f.read(), compiler_specs) qnn_mgr = PyQnnManagerAdaptor.QnnManager( generate_qnn_executorch_option(compiler_specs), ctx_bin ) assert qnn_mgr.Init().value == 0, "failed to load context binary" - qnn_mgr.AllocateTensor() + graph_name = qnn_mgr.GetGraphNames()[0] + qnn_mgr.AllocateTensor(graph_name) if get_input: encoding_input = {"scale": [], "offset": []} for i in range(num_input): - inputs = qnn_mgr.GetGraphInputs()[i] + inputs = qnn_mgr.GetGraphInputs(graph_name)[i] encoding = inputs.GetEncodings() encoding_input["scale"].append(encoding.data["scale"].item()) encoding_input["offset"].append(encoding.data["offset"].item()) @@ -43,7 +49,7 @@ def get_encoding( if get_output: encoding_output = {"scale": [], "offset": []} for i in range(num_output): - outputs = qnn_mgr.GetGraphOutputs()[i] + outputs = qnn_mgr.GetGraphOutputs(graph_name)[i] encoding = outputs.GetEncodings() encoding_output["scale"].append(encoding.data["scale"].item()) encoding_output["offset"].append(encoding.data["offset"].item()) @@ -52,35 +58,25 @@ def get_encoding( return encoding_list -def gen_pte_from_ctx_bin( - artifact, pte_names, compiler_specs, bundle_programs, custom_spill_fill=None -): - - # Lower with QnnBackend - lowered_modules = [ - to_backend("QnnBackend", prog["edge_program"], compiler_specs) - for prog in bundle_programs - ] +def gen_pte_from_ctx_bin(artifact, pte_names, bundle_programs, backend_config): + edge_prog_mgrs = [prog["edge_program_manager"] for prog in bundle_programs] # Setup spill-fill buffer for relieving runtime memory usage - canonicalize_program(lowered_modules, custom_buffer_size=custom_spill_fill) - # export pte files + update_spill_fill_size( + [ + prog_mgr._edge_programs[list(prog_mgr.methods)[0]] + for prog_mgr in edge_prog_mgrs + ] + ) + # Export pte files pte_files = 
[] for pte_name in pte_names: print(f"{pte_name} generating...") - memory_planning_pass = MemoryPlanningPass( - alloc_graph_input=False, - alloc_graph_output=False, - ) pte_files.append(f"{artifact}/{pte_name}.pte") - with open(pte_files[-1], "wb") as file: - file.write( - lowered_modules[0].buffer( - extract_delegate_segments=True, memory_planning=memory_planning_pass - ) - ) + with open(pte_files[-1], "wb") as f: + edge_prog_mgrs[0].to_executorch(config=backend_config).write_to_file(f) # GC for reducing host memory consuming bundle_programs.pop(0) - lowered_modules.pop(0) + edge_prog_mgrs.pop(0) gc.collect() return pte_files diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py index 56169e39a2e..7445ba4a5ec 100644 --- a/examples/qualcomm/scripts/export_example.py +++ b/examples/qualcomm/scripts/export_example.py @@ -5,9 +5,7 @@ import torch from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( capture_program, generate_htp_compiler_spec, diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 573e23640b2..8051d157166 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -12,9 +12,7 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 52c263f0a4b..dc517764f8d 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -17,9 +17,7 @@ import torch from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype -from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, -) +from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( capture_program, generate_htp_compiler_spec, @@ -138,7 +136,7 @@ def push(self, inputs=None, input_list=None, files=None): for file_name in files: self._adb(["push", file_name, self.workspace]) - def execute(self, custom_runner_cmd=None): + def execute(self, custom_runner_cmd=None, method_index=0): self._adb(["shell", f"mkdir -p {self.output_folder}"]) # run the delegation if custom_runner_cmd is None: @@ -155,6 +153,7 @@ def execute(self, custom_runner_cmd=None): if self.dump_intermediate_outputs else "" ), + f"--method_index {method_index}", ] ) qnn_executor_runner_cmds = " ".join( From 8437526e779c99f2788cd4611d405735e9b36ea7 Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Sun, 17 Nov 2024 23:28:03 +0800 Subject: [PATCH 2/3] fix import error in partitioner_lib.py --- extension/llm/export/partitioner_lib.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/extension/llm/export/partitioner_lib.py 
b/extension/llm/export/partitioner_lib.py index 6f4b95e3d08..9920da9574c 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -160,10 +160,8 @@ def get_qnn_partitioner( QnnPartitioner, ) - # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.serialization.qnn_compile_spec_schema` - from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( - QcomChipset, - ) + # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.serialization.qc_schema` + from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` from executorch.backends.qualcomm.utils.utils import ( From 9032c34413ec7a0fd1d40c73739d44afdadd4b9f Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Mon, 18 Nov 2024 13:52:32 +0800 Subject: [PATCH 3/3] apply bzl changes --- backends/qualcomm/aot/python/targets.bzl | 1 + backends/qualcomm/runtime/targets.bzl | 2 ++ backends/qualcomm/targets.bzl | 33 ++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/backends/qualcomm/aot/python/targets.bzl b/backends/qualcomm/aot/python/targets.bzl index e1f5a6a8fc5..8eb8d095c30 100644 --- a/backends/qualcomm/aot/python/targets.bzl +++ b/backends/qualcomm/aot/python/targets.bzl @@ -31,6 +31,7 @@ def define_common_targets(): "//executorch/backends/qualcomm/aot/wrappers:wrappers", "//executorch/backends/qualcomm/runtime:logging", "//executorch/backends/qualcomm:schema", + "//executorch/backends/qualcomm:qc_binary_info_schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl index f7a3e220dee..73d333f52dd 100644 --- a/backends/qualcomm/runtime/targets.bzl +++ b/backends/qualcomm/runtime/targets.bzl @@ -29,6 +29,7 @@ def define_common_targets(): ], exported_deps = [ "//executorch/backends/qualcomm:schema", + "//executorch/backends/qualcomm:qc_binary_info_schema", "//executorch/runtime/core:core", ], ) @@ -63,6 +64,7 @@ def define_common_targets(): "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ":logging", "//executorch/backends/qualcomm:schema", + "//executorch/backends/qualcomm:qc_binary_info_schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/aot/wrappers:wrappers", "//executorch/runtime/backend:interface", diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl index 08d163eefc3..14e02989e5c 100644 --- a/backends/qualcomm/targets.bzl +++ b/backends/qualcomm/targets.bzl @@ -16,6 +16,12 @@ SCHEMA_GEN_RULE_NAME = "qc_compiler_spec_generated" SCHEMA_LIRRARY_NAME = SCHEMA_NAME +QC_BINARY_INFO_SCHEMA = "qc_binary_info" +QC_BINARY_INFO_INPUT_SCHEMA = "serialization/" + QC_BINARY_INFO_SCHEMA + ".fbs" +QC_BINARY_INFO_SCHEMA_GEN_RULE_NAME = QC_BINARY_INFO_SCHEMA + "_generated" +QC_BINARY_INFO_OUTPUT_SCHEMA_HEADER = QC_BINARY_INFO_SCHEMA_GEN_RULE_NAME + ".h" +QC_BINARY_INFO_SCHEMA_LIRRARY_NAME = QC_BINARY_INFO_SCHEMA + def generate_schema_header(rule_name, srcs, headers, default_header): """Generate header file given flatbuffer schema """ @@ -77,6 +83,33 @@ def define_common_targets(): platforms 
= [ANDROID], ) + generate_schema_header( + QC_BINARY_INFO_SCHEMA_GEN_RULE_NAME, + [QC_BINARY_INFO_INPUT_SCHEMA], + [QC_BINARY_INFO_OUTPUT_SCHEMA_HEADER], + QC_BINARY_INFO_OUTPUT_SCHEMA_HEADER, + ) + + runtime.cxx_library( + name = "qc_binary_info_schema", + srcs = [], + visibility = [ + # Lock this down as tightly as possible to ensure that flatbuffers + # are an implementation detail. Ideally this list would only include + # //executorch/runtime/executor/... + "//executorch/codegen/tools/...", + "//executorch/runtime/executor/...", + "//executorch/backends/qualcomm/...", + "//executorch/backends/qualcomm/runtime/...", + ], + exported_headers = { + QC_BINARY_INFO_OUTPUT_SCHEMA_HEADER: ":{}[{}]".format( QC_BINARY_INFO_SCHEMA_GEN_RULE_NAME, QC_BINARY_INFO_OUTPUT_SCHEMA_HEADER), + }, + exported_external_deps = ["flatbuffers-api"], + define_static_target = True, + platforms = [ANDROID], + ) + runtime.cxx_library( name = "qnn_executorch_backend", srcs = [],