Commit 2adb326
[XNNPACK][Weights Cache] Enable in XNNPACK
Pull Request resolved: #9155

We enable the XNNPACK weights cache in the XNNPACK backend. The weights cache is initialized for the runtime with the named data map and a memory allocator (for now the memory allocator is unused, but in the future it could be used to manage the memory for packed weights).

Before creating the runtime, we first initialize the weights cache; this sets the finalization state to false. As we add weight and bias tensors to the graph, we load them through the named data map held by the weights cache and keep a map from each data pointer to its name. When XNNPACK creates the runtime and packs the weights, it calls the weights cache's `look_up_or_insert` method. We use the pointers in the cache key to look up their names and concatenate them (e.g. "weightsbias"), then insert the packed weights under that key. On future look-ups we reuse the pointer cached under that packed tensor key, saving us from packing again.

After creating the runtime and packing the weights, we finalize the cache; this sets `is_finalized` to true. We also free all unpacked buffers loaded from the named data map, since they are no longer needed, and we keep reference counts for the packed weights, incrementing the count of each packed weight used by this runtime. We return a vector of all the packed weight names to the XNNExecutor. When the XNNExecutor is destroyed, we decrement the counts of its packed buffers and destroy them once no references remain.

Note that this feature is gated behind the ENABLE_XNNPACK_WEIGHTS_CACHE flag. Since the weights cache is a global member of the singleton XNNPACK backend class, and it is read/write, we add a mutex to make access to it thread safe. With the new mutex, the mutex hierarchy is: workspace_mutex_ -> weights_cache_mutex_.

ghstack-source-id: 271809922
@exported-using-ghexport

Internal: I ran a simple experiment with the machine translation model. I loaded encode_first and executed it, then loaded forward and executed it (both methods staying in memory), measuring RSS after each step:

|                      | RSS after Encode First (MiB) | RSS after Forward (MiB) |
|----------------------|------------------------------|-------------------------|
| Without Weight Cache | 62.765625                    | 130.019531              |
| With Weight Cache    | 62.789062                    | 93.222656               |

With the weight cache and two methods loaded, this is roughly a 28% reduction in memory usage.

Differential Revision: [D70885926](https://our.internmc.facebook.com/intern/diff/D70885926/)
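Taken together, the cache behaves like a name-keyed, reference-counted map. The sketch below is a standalone, simplified illustration of that bookkeeping, not the real XNNWeightsCache (whose implementation is not part of this commit page); the method names mirror the ones used in the diffs below, but the signatures and types are assumptions.

```cpp
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Simplified stand-in for XNNWeightsCache; real keys, error handling, and
// XNNPACK integration differ.
class WeightsCacheSketch {
  struct PackedEntry {
    std::vector<uint8_t> data;
    size_t ref_count;
  };

  // Pointer -> name for buffers loaded from the named data map during init.
  std::map<const void*, std::string> unpacked_names_;
  // Concatenated name (e.g. "weightsbias") -> packed buffer + refcount.
  std::map<std::string, PackedEntry> packed_;
  // Keys packed or reused while building the current runtime.
  std::vector<std::string> used_by_current_runtime_;

 public:
  // init()-time: record where each unpacked tensor lives so its pointer can
  // later be mapped back to its name.
  void note_unpacked(const std::string& name, const void* data) {
    unpacked_names_[data] = name;
  }

  // Packing-time: map the weight/bias pointers back to names, concatenate
  // them into the cache key, and pack only on a miss.
  const uint8_t* look_up_or_insert(
      const void* weights,
      const void* bias,
      std::vector<uint8_t> (*pack)(const void*, const void*)) {
    std::string key = unpacked_names_.at(weights);
    if (bias != nullptr) {
      key += unpacked_names_.at(bias);
    }
    auto it = packed_.find(key);
    if (it == packed_.end()) {
      it = packed_.emplace(key, PackedEntry{pack(weights, bias), 0}).first;
    }
    used_by_current_runtime_.push_back(key);
    return it->second.data.data();
  }

  // After xnn_create_runtime: drop the unpacked buffers, take a reference on
  // every packed buffer this runtime uses, and hand the names back so the
  // executor can release them in destroy().
  std::vector<std::string> finalize_for_runtime() {
    unpacked_names_.clear(); // the real cache also frees the FreeableBuffers
    for (const std::string& key : used_by_current_runtime_) {
      packed_[key].ref_count++;
    }
    std::vector<std::string> names = std::move(used_by_current_runtime_);
    used_by_current_runtime_.clear();
    return names;
  }

  // destroy()-time: decrement refcounts; erase entries nobody uses anymore.
  void delete_packed_data(const std::vector<std::string>& names) {
    for (const std::string& name : names) {
      auto it = packed_.find(name);
      if (it != packed_.end() && --it->second.ref_count == 0) {
        packed_.erase(it);
      }
    }
  }
};
```

The key point is that packing cost and packed-weight memory are paid once per unique weight/bias pair, no matter how many methods or partitions reference it.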
Parent: 185f422

8 files changed: +131 −31 lines

backends/xnnpack/CMakeLists.txt

+13
```diff
@@ -37,6 +37,19 @@ option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE
 # Keeping this OFF by default due to regressions in decode and model load with
 # kleidi kernels
 option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable Arm Kleidi kernels" OFF)
+
+# Turning this on caches weights between partitions and methods. If weights
+# are shared across methods/partitions then this can reduce load time and
+# memory usage.
+
+# Keeping this off maintains existing behavior. Turning this on serializes
+# execution and initialization of delegates, to be revisited.
+option(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE
+       "Enable weights cache to cache and manage all packed weights" OFF)
+
+if(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE)
+  add_definitions(-DENABLE_XNNPACK_WEIGHTS_CACHE)
+endif()
 if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE)
   add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
 endif()
```

backends/xnnpack/runtime/XNNCompiler.cpp

+60-12
```diff
@@ -11,7 +11,9 @@
 #include <executorch/backends/xnnpack/serialization/schema_generated.h>
 #include <executorch/extension/threadpool/threadpool.h>
 #include <executorch/runtime/executor/pte_data_map.h>
+#include <string>
 #include <unordered_map>
+#include <vector>
 
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
 #pragma clang diagnostic ignored "-Wglobal-constructors"
@@ -167,7 +169,8 @@ const uint8_t* getConstantDataPtr(
     GraphPtr flatbuffer_graph,
     const uint8_t* constant_data_ptr,
     const NamedDataMap* named_data_map,
-    std::vector<FreeableBuffer>& loaded_buffers_from_map) {
+    std::vector<FreeableBuffer>& freeable_buffers,
+    XNNWeightsCache* weights_cache) {
   auto buffer_idx = tensor_value->constant_buffer_idx();
   if (buffer_idx) {
     if (!constant_data_ptr) {
@@ -187,6 +190,15 @@ const uint8_t* getConstantDataPtr(
       return constant_data_ptr + offset;
     } else {
       const std::string& data_name = constant_data_offset->named_key()->str();
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+      Result<const uint8_t*> data_ptr =
+          weights_cache->load_unpacked_data(data_name);
+      if (!data_ptr.ok()) {
+        ET_LOG(Error, "Failed to load weights from cache");
+        return nullptr;
+      }
+      return data_ptr.get();
+#else
       Result<FreeableBuffer> buffer =
           named_data_map->get_data(data_name.c_str());
       if (!buffer.ok()) {
@@ -198,8 +210,9 @@ const uint8_t* getConstantDataPtr(
       }
       const uint8_t* data_ptr =
           static_cast<const uint8_t*>(buffer.get().data());
-      loaded_buffers_from_map.push_back(std::move(buffer.get()));
+      freeable_buffers.push_back(std::move(buffer.get()));
       return data_ptr;
+#endif
     }
   }
 }
@@ -222,7 +235,8 @@ Error defineTensor(
     std::vector<uint32_t>& output_ids,
     CompileAllocator& allocator,
     const NamedDataMap* named_data_map,
-    std::vector<FreeableBuffer>& loaded_buffers_from_map) {
+    std::vector<FreeableBuffer>& freeable_buffers,
+    XNNWeightsCache* weights_cache) {
   const fb_xnnpack::XNNTensorValue* tensor_value = nullptr;
   const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr;
 
@@ -264,7 +278,8 @@ Error defineTensor(
         flatbuffer_graph,
         constant_data_ptr,
         named_data_map,
-        loaded_buffers_from_map);
+        freeable_buffers,
+        weights_cache);
 
     xnn_status status;
     // The type we might have to convert to
@@ -1999,9 +2014,9 @@ ET_NODISCARD Error XNNCompiler::compileModel(
     const void* buffer_pointer,
     size_t num_bytes,
     XNNExecutor* executor,
-    MemoryAllocator* runtime_allocator,
-    const NamedDataMap* named_data_map,
-    xnn_workspace_t workspace) {
+    XNNWeightsCache* weights_cache,
+    xnn_workspace_t workspace,
+    const NamedDataMap* named_data_map) {
   Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes);
   const uint8_t* flatbuffer_data = nullptr;
   const uint8_t* constant_data = nullptr;
@@ -2065,11 +2080,14 @@ ET_NODISCARD Error XNNCompiler::compileModel(
   // Invalid ids do not need to be remapped
   remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID);
 
+  // If the weight cache is not on, we hold onto all the unpacked buffers
+  // and free them at the end
+  std::vector<FreeableBuffer> unpacked_buffers;
+
   // External Ids for inputs and outputs
   std::vector<uint32_t> input_ids;
   std::vector<uint32_t> output_ids;
   Error err = Error::Ok;
-  std::vector<FreeableBuffer> loaded_buffers_from_map;
   for (auto value : *flatbuffer_graph->xvalues()) {
     err = defineTensor(
         subgraph.get(),
@@ -2081,7 +2099,8 @@ ET_NODISCARD Error XNNCompiler::compileModel(
         output_ids,
         compile_allocator,
         named_data_map,
-        loaded_buffers_from_map);
+        unpacked_buffers,
+        weights_cache);
 
     if (err != Error::Ok) {
       return err;
@@ -2103,20 +2122,34 @@ ET_NODISCARD Error XNNCompiler::compileModel(
 
   xnn_runtime_t runtime_ptr = nullptr;
 
+  // If the weights cache is not enabled, then XNNWeightsCache just manages
+  // the unpacked weights until the runtime is created.
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+  ET_CHECK_OR_RETURN_ERROR(
+      unpacked_buffers.size() == 0,
+      Internal,
+      "Weight Cache is enabled, which means unpacked buffers should be owned by the cache");
+  xnn_weights_cache_t weights_cache_ptr =
+      weights_cache->get_num_unpacked_data() > 0 ? weights_cache->get()
+                                                 : nullptr;
+#else
+  xnn_weights_cache_t weights_cache_ptr = nullptr;
+#endif
+
 #ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
   ET_CHECK_OR_RETURN_ERROR(
       workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace");
   status = xnn_create_runtime_v4(
       subgraph.get(),
-      /*weight_cache=*/nullptr, // TODO - support weight cache
+      weights_cache_ptr,
       workspace,
       ::executorch::extension::threadpool::get_pthreadpool(),
       runtime_flags,
       &runtime_ptr);
 #else
   status = xnn_create_runtime_v3(
       subgraph.get(),
-      /*weight_cache=*/nullptr, // TODO - support weight cache
+      weights_cache_ptr,
       ::executorch::extension::threadpool::get_pthreadpool(),
       runtime_flags,
       &runtime_ptr);
@@ -2128,10 +2161,25 @@ ET_NODISCARD Error XNNCompiler::compileModel(
       "XNN Runtime creation failed with code: %s",
       xnn_status_to_string(status));
 
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+  auto packed_weights_names = weights_cache->finalize_for_runtime();
+  ET_CHECK_OR_RETURN_ERROR(
+      packed_weights_names.ok(),
+      Internal,
+      "Failed to finalize weights cache after creating the xnn runtime")
+#else
+  for (auto& buffer : unpacked_buffers) {
+    buffer.Free();
+  }
+  Result<std::vector<std::string>> packed_weights_names =
+      std::vector<std::string>();
+#endif
+
   err = executor->initialize( // NOLINT: runtime_ptr is non-null
       runtime_ptr,
       std::move(input_ids),
-      std::move(output_ids));
+      std::move(output_ids),
+      std::move(packed_weights_names.get()));
 
   return err;
 };
```
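For readers skimming the `#ifdef` branches above, this condensed sketch (hypothetical stub types, not XNNCompiler code) shows what happens to the unpacked buffers and the returned names once the runtime exists, under each build configuration:

```cpp
#include <string>
#include <vector>

// Hypothetical stand-in for FreeableBuffer.
struct BufferSketch {
  void Free() {}
};

// Returns the names the executor must later hand back to the cache.
std::vector<std::string> after_runtime_created(
    bool cache_enabled,
    std::vector<BufferSketch>& unpacked_buffers) {
  if (cache_enabled) {
    // Cache build: unpacked_buffers is empty (the cache owned the unpacked
    // data), and finalize_for_runtime() frees it and returns the names of
    // the packed buffers this runtime uses.
    return {"weightsbias"}; // stands in for weights_cache->finalize_for_runtime()
  }
  // Non-cache build: XNNPACK has finished packing by now, so the unpacked
  // copies can be freed here, and there are no cache entries to track.
  for (BufferSketch& buffer : unpacked_buffers) {
    buffer.Free();
  }
  return {};
}
```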

backends/xnnpack/runtime/XNNCompiler.h

+4-6
```diff
@@ -9,11 +9,9 @@
 #pragma once
 
 #include <executorch/backends/xnnpack/runtime/XNNExecutor.h>
+#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
 #include <executorch/runtime/platform/compiler.h>
-
 #include <xnnpack.h>
-#include <memory>
-#include <vector>
 
 namespace executorch {
 namespace backends {
@@ -29,9 +27,9 @@ class XNNCompiler {
       const void* buffer_pointer,
       size_t num_bytes,
       XNNExecutor* executor,
-      executorch::runtime::MemoryAllocator* runtime_allocator,
-      const executorch::runtime::NamedDataMap* named_data_map,
-      xnn_workspace_t workspace);
+      XNNWeightsCache* weights_cache,
+      xnn_workspace_t workspace,
+      const NamedDataMap* named_data_map);
 };
 
 } // namespace delegate
```

backends/xnnpack/runtime/XNNExecutor.cpp

+3-1
```diff
@@ -30,7 +30,8 @@ using executorch::runtime::kTensorDimensionLimit;
 ET_NODISCARD Error XNNExecutor::initialize(
     xnn_runtime_t runtime,
     std::vector<uint32_t>&& input_ids,
-    std::vector<uint32_t>&& output_ids) {
+    std::vector<uint32_t>&& output_ids,
+    std::vector<std::string>&& packed_data_names) {
   runtime_ = std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)>(
       runtime, xnn_delete_runtime);
 
@@ -51,6 +52,7 @@ ET_NODISCARD Error XNNExecutor::initialize(
   std::sort(output_ids_.begin(), output_ids_.end());
 
   externals_.resize(input_ids_.size() + output_ids_.size());
+  packed_data_names_ = std::move(packed_data_names);
 
   return Error::Ok;
 }
```

backends/xnnpack/runtime/XNNExecutor.h

+7-1
```diff
@@ -34,6 +34,7 @@ class XNNExecutor {
   std::vector<uint32_t> input_ids_;
   std::vector<uint32_t> output_ids_;
   std::vector<xnn_external_value> externals_;
+  std::vector<std::string> packed_data_names_;
 
  public:
   XNNExecutor() = default;
@@ -46,6 +47,10 @@ class XNNExecutor {
     return output_ids_.size();
   }
 
+  inline std::vector<std::string> get_packed_data_names() {
+    return packed_data_names_;
+  }
+
   /**
    * Initialize the XNNExecutor with a given runtime and input/output ids.
    * The input/output ids are expected to be sorted in order of their
@@ -54,7 +59,8 @@ class XNNExecutor {
   ET_NODISCARD executorch::runtime::Error initialize(
       xnn_runtime_t runtime,
       std::vector<uint32_t>&& input_ids,
-      std::vector<uint32_t>&& output_ids);
+      std::vector<uint32_t>&& output_ids,
+      std::vector<std::string>&& packed_data_names);
 
   /**
    * Prepares the arguments for runtime graph execution.
```
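The executor itself stays passive: it only stores the names handed to it at initialize() and surrenders them on request. A toy stand-in (everything here is hypothetical except get_packed_data_names, which matches the diff):

```cpp
#include <string>
#include <utility>
#include <vector>

class ExecutorSketch {
  std::vector<std::string> packed_data_names_;

 public:
  // Moved in exactly once at init; the executor never interprets the names.
  void set_packed_data_names(std::vector<std::string>&& names) {
    packed_data_names_ = std::move(names);
  }

  // Returned by value, like get_packed_data_names() in the diff, so the
  // backend can still read the names while the executor is being torn down.
  std::vector<std::string> get_packed_data_names() const {
    return packed_data_names_;
  }
};
```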

backends/xnnpack/runtime/XNNPACKBackend.cpp

+35-7
```diff
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/backends/xnnpack/runtime/XNNCompiler.h>
+#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
@@ -20,6 +21,7 @@
 namespace executorch {
 namespace backends {
 
+using executorch::backends::xnnpack::delegate::XNNWeightsCache;
 using executorch::runtime::ArrayRef;
 using executorch::runtime::Backend;
 using executorch::runtime::BackendExecutionContext;
@@ -81,13 +83,18 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
     }
 
     const NamedDataMap* named_data_map = context.get_named_data_map();
-
-#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
-    // This is needed to serialize access to xnn_create_runtime which is not
     // thread safe. This can heppen when multiple threads call init() on
     // the same backend instance.
+#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
     const std::lock_guard<std::mutex> lock(workspace_mutex_);
 #endif
+
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+    const std::lock_guard<std::mutex> lock_weight_cache(weights_cache_mutex_);
+    weights_cache_->initialize_for_runtime(
+        context.get_runtime_allocator(), named_data_map);
+#endif
+
     // Executor has been allocated but not constructed, ensure that runtime_ is
     // nullptr by constructing it in place here. NOTE: Since we use placement
     // new and since this type is not trivially destructible, we must call the
@@ -97,9 +104,9 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
         processed->data(),
         processed->size(),
         executor,
-        context.get_runtime_allocator(),
-        named_data_map,
-        workspace_.get());
+        weights_cache_.get(),
+        workspace_.get(),
+        named_data_map);
     // This backend does not need its processed data after compiling the model.
     processed->Free();
 
@@ -125,6 +132,10 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
     const std::lock_guard<std::mutex> lock(workspace_mutex_);
 #endif
 
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+    const std::lock_guard<std::mutex> lock_weights_cache(weights_cache_mutex_);
+#endif
+
     // Prepare Inputs/Outputs and Propagate Input Shapes
     Error err = executor->prepare_args(args);
     if (err != Error::Ok) {
@@ -145,16 +156,24 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
 
   void destroy(DelegateHandle* handle) const override {
     if (handle != nullptr) {
-#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
       // This is needed to serialize access to xnn_delete_runtime which is not
       // thread safe. This can heppen when multiple threads call destroy() on
       // the same backend instance.
+#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
       const std::lock_guard<std::mutex> lock(workspace_mutex_);
 #endif
+
       auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
+
 #ifdef ENABLE_XNNPACK_PROFILING
       executor->print_avg_op_timings();
 #endif
+
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+      const std::lock_guard<std::mutex> lock_weights_cache(
+          weights_cache_mutex_);
+      weights_cache_->delete_packed_data(executor->get_packed_data_names());
+#endif
       // XNNExecutor is not trivially destructible. Since this was constructed
       // manually in init(), we must destroy it manually here.
       executor->~XNNExecutor();
@@ -167,6 +186,15 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
   std::unique_ptr<xnn_workspace, decltype(&xnn_release_workspace)> workspace_{
       nullptr,
       &xnn_release_workspace};
+
+  // Weights cache is global to all delegate instances.
+  mutable std::mutex weights_cache_mutex_;
+  std::unique_ptr<XNNWeightsCache> weights_cache_ =
+      std::make_unique<XNNWeightsCache>();
+
+  // Lock Hierarchy for Mutexes:
+  // workspace_mutex_
+  // weights_cache_mutex_
 };
 
 namespace {
```
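A minimal illustration of why the recorded lock hierarchy matters. The free function and global locks here are hypothetical; in the real backend both locks are class members acquired under their respective `#ifdef`s:

```cpp
#include <mutex>

// Stand-ins for the two member mutexes (trailing underscores kept to echo
// the member names in the diff).
std::mutex workspace_mutex_;
std::mutex weights_cache_mutex_;

void locked_section() {
  // Every code path (init, execute, destroy) acquires in this same order,
  // workspace first, then weights cache, so no two threads can ever hold
  // one lock each while waiting on the other -- the classic deadlock.
  const std::lock_guard<std::mutex> workspace_lock(workspace_mutex_);
  const std::lock_guard<std::mutex> weights_cache_lock(weights_cache_mutex_);
  // ... touch the shared workspace and the weights cache ...
}
```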

backends/xnnpack/targets.bzl

+7-3
```diff
@@ -6,11 +6,15 @@ def _get_preprocessor_flags():
     Disable if someone explictly specified a config option,
     else Enable otherwise
     """
-    if native.read_config("executorch", "xnnpack_workspace_sharing", "0") == "0":
-        return []
+    preprocessor_flags = []
+    if native.read_config("executorch", "xnnpack_workspace_sharing", "0") != "0":
+        preprocessor_flags.append("-DENABLE_XNNPACK_SHARED_WORKSPACE")
+
+    if native.read_config("executorch", "xnnpack_weights_cache", "0") != "0":
+        preprocessor_flags.append("-DENABLE_XNNPACK_WEIGHTS_CACHE")
 
     # Enable if not disabled through config
-    return ["-DENABLE_XNNPACK_SHARED_WORKSPACE"]
+    return preprocessor_flags
 
 def define_common_targets():
     runtime.cxx_library(
```

backends/xnnpack/test/runtime/test_xnnexecutor.cpp

+2-1
```diff
@@ -74,7 +74,8 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) {
       },
      {
           1,
-      }),
+      },
+      {}),
       Error::Ok);
   TensorFactory<executorch::aten::ScalarType::Int> tf;
   auto input_tensor = tf.make({1, 1, 1, 1, 1, 1, 1, 1, 1}, {42});
```
