From ac2258bd8ab19397c444d7f8b5d4d1983f81369d Mon Sep 17 00:00:00 2001
From: Max Ren <maxren@meta.com>
Date: Tue, 11 Mar 2025 12:54:28 -0700
Subject: [PATCH] [XNNPACK][Weights Cache] Initial Weights Cache Design with
 NamedDataMap

XNNWeightsCache Design with NamedDataMap. The intent of the weights cache is for tensors to be loaded (via name) through the named data map.

APIs to be used by XNNCompiler:

- load_unpacked_data
    - Takes in a string name (tensor name). The weights cache loads the data for this string from the named data map and returns the pointer. It also creates a mapping of this pointer to the name which is later used by the XNNPACK's internal weight cache implementation

- free_unpacked_data
    - Frees all the unpacked data loaded from NamedDataMap. This is only safe to call after xnn_create_runtime has been called. This is because create_runtime takes unpacked data pointers and packs them into a separate buffer.

- a couple getter methods
    - get_packed_data_names
    - get_unpacked_data_names
    - get_num_packed_data
    - get() (gets the xnn_weights_cache object)


Internal APIs used by XNNPACK Library

- look_up
    - takes a cache key (weight and bias pointers) and looks up the offset to the packed weight if it exists
- look_up_or_insert
    - takes a cache key and pointer to packed weights and looks up the offset if it exists, or inserts a new packed weight into the cache and returns that offset
- offset_to_addr
    - gets offset and returns address to packed pointer
- reserve_space
    - returns a memory address with the appropriate size for XNNPACK to populate with packed weights (I want to use the runtime_allocator for this, but I don't think we have the right sizes, so for now we are just using a string buffer and resizing it)
- is_finalized
     - since this cache doesn't necessarily need to care about a finalized state we always return true.
- delete_cache
    - deletes cache

Differential Revision: [D70885917](https://our.internmc.facebook.com/intern/diff/D70885917/)

[ghstack-poisoned]
---
 backends/xnnpack/runtime/XNNWeightsCache.cpp  | 227 ++++++++++++++
 backends/xnnpack/runtime/XNNWeightsCache.h    | 166 ++++++++++
 backends/xnnpack/targets.bzl                  |   2 +-
 .../test/runtime/test_xnn_weights_cache.cpp   | 286 ++++++++++++++++++
 backends/xnnpack/test/targets.bzl             |  10 +
 extension/testing_util/targets.bzl            |   1 +
 6 files changed, 691 insertions(+), 1 deletion(-)
 create mode 100644 backends/xnnpack/runtime/XNNWeightsCache.cpp
 create mode 100644 backends/xnnpack/runtime/XNNWeightsCache.h
 create mode 100644 backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp

diff --git a/backends/xnnpack/runtime/XNNWeightsCache.cpp b/backends/xnnpack/runtime/XNNWeightsCache.cpp
new file mode 100644
index 00000000000..199e58b3f5b
--- /dev/null
+++ b/backends/xnnpack/runtime/XNNWeightsCache.cpp
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
+#include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/core/error.h>
+#include <sys/stat.h>
+#include <xnnpack.h>
+namespace executorch {
+namespace backends {
+namespace xnnpack {
+namespace delegate {
+
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::NamedDataMap;
+
+// Starts with no allocator/data map and unfinalized, so no member is read
+// while indeterminate; then wires the static callbacks into the provider.
+XNNWeightsCache::XNNWeightsCache()
+    : runtime_allocator_(nullptr), named_data_map_(nullptr),
+      is_finalized_(false) {
+  weights_cache_.context = this;
+  weights_cache_.look_up = (size_t(*)(
+      void*, const xnn_weights_cache_look_up_key*))XNNWeightsCache::look_up;
+  weights_cache_.reserve_space = (void* (*)(void*, size_t))XNNWeightsCache::reserve_space;
+  weights_cache_.look_up_or_insert =
+      (size_t(*)(void*, const xnn_weights_cache_look_up_key*, void*, size_t))
+          XNNWeightsCache::look_up_or_insert;
+  weights_cache_.is_finalized = (bool (*)(void*))XNNWeightsCache::is_finalized;
+  weights_cache_.offset_to_addr = (void* (*)(void*, size_t))XNNWeightsCache::offset_to_addr;
+  weights_cache_.delete_cache = (enum xnn_status(*)(void*))XNNWeightsCache::delete_cache;
+}
+
+Error XNNWeightsCache::initialize_for_runtime(
+    MemoryAllocator* runtime_allocator, 
+    const NamedDataMap* named_data_map)
+  {
+
+  runtime_allocator_ = runtime_allocator;
+  named_data_map_ = named_data_map;
+  is_finalized_ = false;
+
+  return Error::Ok;
+}
+
+Result<std::vector<std::string>> XNNWeightsCache::finalize_for_runtime(){
+  is_finalized_ = true;
+
+  // All data has been packed by create_runtime
+  // so we clear the unpacked data as it is no longer needed
+  for (FreeableBuffer& buffer : unpacked_data_){
+    buffer.Free();
+  }
+  unpacked_data_.clear();
+  unpacked_data_to_name_.clear();
+
+  std::vector<std::string> packed_data_names;
+  // update the reference count of all the packed data
+  // used by this runtime
+  for (auto& entry : name_to_packed_data_metadata_){
+    if (entry.second.in_current_runtime){
+      entry.second.ref_count++;
+      entry.second.in_current_runtime = false;
+      packed_data_names.push_back(entry.first);
+    }
+  }
+
+  return packed_data_names;
+}
+
+
+Result<const uint8_t*> XNNWeightsCache::load_unpacked_data(const std::string& name){
+  Result<FreeableBuffer> named_data = named_data_map_->get_data(name.c_str());
+  if (!named_data.ok()){
+    ET_LOG(Error, "Failed to load constant data for key %s", name.c_str());
+    return Error::InvalidExternalData;
+  }
+  const uint8_t* data_pointer = static_cast<const uint8_t*>(named_data.get().data());
+  unpacked_data_.push_back(std::move(named_data.get()));
+  unpacked_data_to_name_[data_pointer] = name;
+
+  return data_pointer;
+}
+
+// Releases one runtime's reference to each named packed entry, freeing entries
+// that become unreferenced. Returns InvalidArgument if unfinalized or a name is unknown.
+Error XNNWeightsCache::delete_packed_data(const std::vector<std::string>& packed_data_names){
+  if (!is_finalized_){
+    ET_LOG(Error, "Error, attempted to delete packed data from the cache but the cache is not finalized");
+    return Error::InvalidArgument;
+  }
+  for (const std::string& name : packed_data_names){
+    auto entry = name_to_packed_data_metadata_.find(name);
+    if (entry == name_to_packed_data_metadata_.end()){
+      ET_LOG(Error, "Error, attempted to deleted packed data: %s, from the cache but it wasn't found", name.c_str());
+      return Error::InvalidArgument;
+    }
+    PackedDataMeta& metadata = entry->second;
+    // The `> 0` guard keeps an entry that was never counted from wrapping ref_count
+    if (metadata.ref_count > 0 && --metadata.ref_count > 0) continue;
+    // Erasing the container frees the memory backing the packed data
+    packed_pointer_to_container_.erase(packed_data_ptrs_[metadata.offset]);
+    // Null the slot in place so the offsets of the other entries stay valid
+    packed_data_ptrs_[metadata.offset] = nullptr;
+    // Erase by iterator: avoids a second lookup keyed on the dying element
+    name_to_packed_data_metadata_.erase(entry);
+  }
+
+  return Error::Ok;
+}
+
+
+size_t XNNWeightsCache::look_up(
+    XNNWeightsCache* context,
+    const xnn_weights_cache_look_up_key* cache_key) {
+  const void* unpacked_weights_ptr = cache_key->kernel;
+  const void* unpacked_bias_ptr = cache_key->bias;
+  auto entry = context->unpacked_data_to_name_.find(unpacked_weights_ptr);
+
+  // Check if weight_pointer has been cached
+  if (entry == context->unpacked_data_to_name_.end()){
+    return SIZE_MAX;
+  }
+
+  std::string weight_bias_name = entry->second;
+
+  // Check if bias_pointer has been cached
+  if (unpacked_bias_ptr != nullptr){
+    auto bias_entry = context->unpacked_data_to_name_.find(unpacked_bias_ptr);
+    if (bias_entry != context->unpacked_data_to_name_.end()){
+      weight_bias_name.append(bias_entry->second);
+    }
+  }
+
+  // check if weight_bias_name has been packed already
+  auto packed_weight_entry = context->name_to_packed_data_metadata_.find(weight_bias_name);
+  if (packed_weight_entry == context->name_to_packed_data_metadata_.end()){
+    return SIZE_MAX;
+  }
+  packed_weight_entry->second.in_current_runtime = true;
+
+  return packed_weight_entry->second.offset;
+}
+
+// Reserves n bytes for packed weights and returns a pointer aligned to
+// kPackedAllocationAlignment; the owning std::string is kept in
+// packed_pointer_to_container_ under that pointer until the entry is erased.
+void* XNNWeightsCache::reserve_space(XNNWeightsCache* context, size_t n) {
+  // TODO: allocate from context->runtime_allocator_ instead of a std::string.
+  const size_t alignment = kPackedAllocationAlignment;
+  std::string data_container;
+  data_container.resize(n + alignment);
+  // Step to the next boundary; the extra `alignment` bytes keep n usable.
+  const uintptr_t base = (uintptr_t)data_container.data();
+  void* aligned_space = (void*)(base + alignment - base % alignment);
+  context->packed_pointer_to_container_[aligned_space] = std::move(data_container);
+  return aligned_space;
+}
+
+size_t XNNWeightsCache::look_up_or_insert(
+    XNNWeightsCache* context,
+    const xnn_weights_cache_look_up_key* cache_key,
+    void* ptr,
+    size_t size) {
+  size_t offset = context->look_up(context, cache_key);
+
+  if (offset != SIZE_MAX) {
+    void* saved_ptr = context->offset_to_addr(context, offset);
+    if (0 == memcmp(ptr, saved_ptr, size)) {
+      return offset;
+    }
+    // Failure, cache is out of date
+    return SIZE_MAX;
+  }
+
+  // Not in the cache yet: the new entry's offset is its index in packed_data_ptrs_
+  size_t next_offset = context->packed_data_ptrs_.size();
+  auto entry = context->unpacked_data_to_name_.find(cache_key->kernel);
+
+  // Only record name metadata if the weight pointer came from load_unpacked_data
+  if (entry != context->unpacked_data_to_name_.end()){
+    std::string weight_bias_name = entry->second;
+    if (cache_key->bias != nullptr){
+      auto bias_entry = context->unpacked_data_to_name_.find(cache_key->bias);
+      if (bias_entry != context->unpacked_data_to_name_.end()){
+        weight_bias_name.append(bias_entry->second);
+      }
+    }
+    PackedDataMeta packed_data_metadata = {
+      .offset=next_offset,
+      .ref_count = 0, // ref_count is only incremented after finalizing for runtime
+      .in_current_runtime = true
+    };
+    context->name_to_packed_data_metadata_[weight_bias_name] = packed_data_metadata;
+  } else{
+    ET_LOG(
+      Info, 
+      "Warning: Unpacked weight and bias were not registered with names, "
+      "this will add new cache entries for packed data and may affect performance."
+    );
+  }
+  context->packed_data_ptrs_.push_back(ptr);
+
+  return next_offset;
+}
+
+bool XNNWeightsCache::is_finalized(XNNWeightsCache* context) {
+  return context->is_finalized_;
+}
+
+void* XNNWeightsCache::offset_to_addr(XNNWeightsCache* context, size_t offset) {
+  return context->packed_data_ptrs_[offset];
+}
+
+enum xnn_status XNNWeightsCache::delete_cache(XNNWeightsCache* context) {
+  return xnn_status_success;
+}
+
+} // namespace delegate
+} // namespace xnnpack
+} // namespace backends
+} // namespace executorch
diff --git a/backends/xnnpack/runtime/XNNWeightsCache.h b/backends/xnnpack/runtime/XNNWeightsCache.h
new file mode 100644
index 00000000000..39c82e5973e
--- /dev/null
+++ b/backends/xnnpack/runtime/XNNWeightsCache.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <xnnpack.h>
+
+#include <executorch/runtime/executor/pte_data_map.h>
+#include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/result.h>
+#include <string>
+#include <array>
+#include <unordered_map>
+
+namespace executorch {
+namespace backends {
+namespace xnnpack {
+namespace delegate {
+
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::NamedDataMap;
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+using executorch::runtime::FreeableBuffer;
+
+struct PackedDataMeta {
+  size_t offset;
+  // Count number of xnn_runtime_t this packed data is used in
+  size_t ref_count;
+  // true if this packed data was inserted or looked up for the
+  // current runtime being created
+  bool in_current_runtime;
+};
+
+class XNNWeightsCache {
+ public:
+  XNNWeightsCache();
+
+  /**
+   * Initializes the XNNWeightsCache for the next xnn_create_runtime
+   */
+  Error initialize_for_runtime(
+    MemoryAllocator* runtime_allocator, 
+    const NamedDataMap* named_data_map);
+
+  /**
+   * Finalizes the weights cache after the weights have been packed
+   * in xnn_create_runtime. 
+   *
+   * This should only be called after creating the runtime. Returns
+   * the name of all the packed weights used by this runtime
+   */
+  Result<std::vector<std::string>> finalize_for_runtime();
+
+  // Taken from XNN_ALLOCATION_ALIGNMENT in xnnpack/common.h
+  static const size_t kPackedAllocationAlignment = 64;
+
+  /**
+   * Returns XNNPACK's underlying weights_cache pointer
+   */
+  inline xnn_weights_cache_t get() {
+    return (xnn_weights_cache_t)&weights_cache_;
+  }
+
+  /**
+   * Returns the number of unpacked data
+   */
+  inline size_t get_num_unpacked_data(){
+    return unpacked_data_.size();
+  };
+
+  /**
+   * Returns the names of all unpacked data 
+   */
+  inline std::vector<std::string> get_unpacked_data_names(){
+    std::vector<std::string> names;
+    for (const auto& pair : unpacked_data_to_name_) {
+      names.push_back(pair.second);
+    }
+    return names;
+  };
+
+  /**
+   * Returns the packed data names
+   */
+  inline std::vector<std::string> get_packed_data_names(){
+    std::vector<std::string> names;
+    for (const auto& pair : name_to_packed_data_metadata_) {
+      names.push_back(pair.first);
+    }
+    return names;
+  };
+
+
+  /**
+   * Loads unpacked named data from the NamedDataMap into this XNNWeightsCache
+   * and returns a pointer to the unpacked data. This unpacked data is given
+   * to XNNPACK's define_tensor APIs, and used as the cache key for look_up_or_insert.
+   * @param[in] name The name of the data to load
+   * @return A pointer to the unpacked data that was loaded, or an error if the name could not be loaded
+   */
+  Result<const uint8_t*> load_unpacked_data(const std::string& name);
+
+  /**
+   * Deletes the packed data associated with the names given. 
+   * Decrements the ref_count if the packed data is used by other
+   * models
+   * 
+   */
+   Error delete_packed_data(const std::vector<std::string>& packed_names);
+
+
+ private:
+  // Runtime Allocator used to reserve memory for packed weights
+  MemoryAllocator* runtime_allocator_;
+
+  // Named Data Map used to load named data
+  const NamedDataMap* named_data_map_;
+
+  // Map of unpacked pointers to the data name
+  std::unordered_map<const void*, std::string> unpacked_data_to_name_;
+  // Map of data names to offset into the packed data
+  std::unordered_map<std::string, PackedDataMeta> name_to_packed_data_metadata_;
+  // Vector holding list of pointers to the packed data
+  std::vector<void*> packed_data_ptrs_;
+  // Map from each aligned packed-data pointer to the std::string container that owns its memory
+  std::unordered_map<void*, std::string> packed_pointer_to_container_;
+  // Vector holding the list of unpacked freeable buffers
+  std::vector<FreeableBuffer> unpacked_data_;
+  // xnnpack's weight cache provider
+  xnn_weights_cache_provider weights_cache_;
+  // whether or not the weight cache is finalized
+  bool is_finalized_;
+
+  // Function pointers to override XNNPACK's default xnn_weights_cache_provider
+  // functions.
+  static size_t look_up(
+      XNNWeightsCache* context,
+      const xnn_weights_cache_look_up_key* cache_key);
+
+  static void* reserve_space(XNNWeightsCache* context, size_t n);
+
+  static size_t look_up_or_insert(
+      XNNWeightsCache* context,
+      const xnn_weights_cache_look_up_key* cache_key,
+      void* ptr,
+      size_t size);
+
+  static bool is_finalized(XNNWeightsCache* context);
+
+  static void* offset_to_addr(XNNWeightsCache* context, size_t offset);
+
+  static enum xnn_status delete_cache(XNNWeightsCache* context);
+
+};
+
+} // namespace delegate
+} // namespace xnnpack
+} // namespace backends
+} // namespace executorch
diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl
index 3fd9c433372..bb7f1979d3a 100644
--- a/backends/xnnpack/targets.bzl
+++ b/backends/xnnpack/targets.bzl
@@ -53,10 +53,10 @@ def define_common_targets():
             # "-DENABLE_XNNPACK_KLEIDI"
         ] + _get_preprocessor_flags(),
         exported_deps = [
+            third_party_dep("XNNPACK"),
             "//executorch/runtime/backend:interface",
         ],
         deps = [
-            third_party_dep("XNNPACK"),
             "//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header",
             "//executorch/extension/threadpool:threadpool",
             "//executorch/runtime/core/exec_aten/util:tensor_util",
diff --git a/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp
new file mode 100644
index 00000000000..a6ca73331c1
--- /dev/null
+++ b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
+	
+#include <executorch/runtime/executor/pte_data_map.h>
+
+#include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/extension/testing_util/temp_file.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <executorch/schema/program_generated.h>
+#include <gtest/gtest.h>
+#include <xnnpack.h>
+
+using executorch::backends::xnnpack::delegate::XNNWeightsCache;
+using executorch::runtime::MemoryAllocator;
+using executorch::extension::FileDataLoader;
+using executorch::extension::testing::TempFile;
+using executorch::runtime::DataLoader;
+using executorch::runtime::Error;
+using executorch::runtime::FreeableBuffer;
+using executorch::runtime::Result;
+using executorch::runtime::internal::PteDataMap;
+
+class XNNWeightsCacheTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Creating a NamedDataMap from scratch is a little bit convoluted, so
+    // we copied a lot of setup from test_pte_data_map.cpp
+
+    // Since these tests cause ET_LOG to be called, the PAL must be initialized
+    // first.
+    executorch::runtime::runtime_init();
+
+    // Create a sample Program with only named_data and segments. Technically
+    // not a valid Program; only used to test the PteDataMap.
+    // Create named data.
+    std::array<const flatbuffers::Offset<executorch_flatbuffer::NamedData>, 2>
+        named_data_arr = {
+            executorch_flatbuffer::CreateNamedDataDirect(
+                builder_, "weight", /*segment_index=*/0),
+            executorch_flatbuffer::CreateNamedDataDirect(
+                builder_, "bias", /*segment_index=*/1),
+        };
+    const auto named_data =
+        builder_.CreateVector(named_data_arr.data(), named_data_arr.size());
+
+    // Create segments.
+    std::array<const flatbuffers::Offset<executorch_flatbuffer::DataSegment>, 2>
+        segment_arr = {// @lint-ignore CLANGTIDY facebook-hte-BadArgumentComment
+                       executorch_flatbuffer::CreateDataSegment(
+                           builder_, /*offset=*/0, /*size=*/kSegmentSizes[0]),
+                       // @lint-ignore CLANGTIDY facebook-hte-BadArgumentComment
+                       executorch_flatbuffer::CreateDataSegment(
+                           builder_,
+                           /*offset=*/kSegmentAlignment * 2,
+                           /*size=*/kSegmentSizes[1])};
+    const auto segments =
+        builder_.CreateVector(segment_arr.data(), segment_arr.size());
+
+    // Create Program.
+    const auto program = executorch_flatbuffer::CreateProgram(
+        builder_, 0, 0, 0, 0, segments, 0, 0, named_data);
+
+    builder_.Finish(program);
+    program_ = executorch_flatbuffer::GetProgram(builder_.GetBufferPointer());
+
+    // Create sample segment data.
+    for (int i = 0; i < kSegmentSizes[0]; i++) {
+      sample_data_[i] = 1;
+    }
+    for (int i = kSegmentOffsets[1]; i < kSegmentOffsets[1] + kSegmentSizes[1];
+         i++) {
+      sample_data_[i] = 2;
+    }
+    TempFile tf(sample_data_.data(), sizeof(sample_data_));
+
+    // Wrap the sample data in a loader.
+    Result<FileDataLoader> loader =
+        FileDataLoader::from(tf.path().c_str(), kSegmentAlignment);
+    ASSERT_EQ(loader.error(), Error::Ok);
+    data_map_loader_ =
+        std::make_unique<FileDataLoader>(std::move(loader.get()));
+    
+    Result<PteDataMap> data_map = PteDataMap::create(
+      data_map_loader_.get(), 0, program_->named_data(), program_->segments());
+    ASSERT_EQ(data_map.error(), Error::Ok);
+    data_map_ = std::make_unique<PteDataMap>(std::move(data_map.get()));
+
+    memory_allocator_ = std::make_unique<MemoryAllocator>(
+        memory_allocator_data_.size(), memory_allocator_data_.data());
+    	
+
+    xnn_status status = xnn_initialize(nullptr);
+    ASSERT_EQ(status, xnn_status_success);
+  }
+
+  void BuildAndRunGraphWithWeightsCache(
+    XNNWeightsCache& weight_cache,
+    const std::vector<size_t>& batches,
+    size_t input_channels,
+    size_t output_channels,
+    float* input_data,
+    float* output_data
+  ){
+    // Defining subgraph
+    xnn_subgraph_t subgraph_ptr = nullptr;
+    xnn_status status = xnn_create_subgraph(
+        /*external_value_ids=*/2,
+        /*flags=*/0,
+        &subgraph_ptr);
+    ASSERT_EQ(status, xnn_status_success);
+    std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+        subgraph_ptr, &xnn_delete_subgraph);
+
+    // Define tensors
+    // Define input
+    uint32_t input_id;
+    std::vector<size_t> input_dims(batches);
+    input_dims.push_back(input_channels);
+    status = xnn_define_tensor_value(
+        subgraph_ptr,
+        xnn_datatype_fp32,
+        input_dims.size(),
+        input_dims.data(),
+        nullptr,
+        0,
+        XNN_VALUE_FLAG_EXTERNAL_INPUT,
+        &input_id);
+
+    // Define weight
+    uint32_t weight_id;
+    Result<const uint8_t*> weight_pointer = weight_cache.load_unpacked_data(
+      "weight"
+    );
+    ASSERT_TRUE(weight_pointer.ok());
+    ASSERT_TRUE(weight_pointer.get() != nullptr);
+    std::vector<size_t> weight_dims{output_channels, input_channels};
+    status = xnn_define_tensor_value(
+        subgraph_ptr,
+        xnn_datatype_fp32,
+        weight_dims.size(),
+        weight_dims.data(),
+        weight_pointer.get(),
+        XNN_INVALID_VALUE_ID,
+        0,
+        &weight_id);
+    ASSERT_EQ(status, xnn_status_success);
+
+    // Define bias
+    uint32_t bias_id;
+    Result<const uint8_t*> bias_pointer = weight_cache.load_unpacked_data(
+      "bias"
+    );
+    ASSERT_TRUE(bias_pointer.ok());
+    std::vector<size_t> bias_dims{output_channels};
+    status = xnn_define_tensor_value(
+        subgraph_ptr,
+        xnn_datatype_fp32,
+        bias_dims.size(),
+        bias_dims.data(),
+        bias_pointer.get(),
+        XNN_INVALID_VALUE_ID,
+        0,
+        &bias_id);
+
+
+    // Define output tensor
+    uint32_t output_id;
+    std::vector<size_t> output_dims(batches);
+    output_dims.push_back(output_channels);
+    status = xnn_define_tensor_value(
+        subgraph_ptr,
+        xnn_datatype_fp32,
+        output_dims.size(),
+        output_dims.data(),
+        nullptr,
+        1,
+        XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
+        &output_id);
+
+    // Define the fully connected node
+    status = xnn_define_fully_connected(
+        subgraph_ptr,
+        -std::numeric_limits<float>::infinity(),
+        std::numeric_limits<float>::infinity(),
+        input_id,
+        weight_id,
+        bias_id,
+        output_id,
+        0);
+    // Create and Pack Weights
+    xnn_runtime_t runtime_ptr = nullptr;
+    status = xnn_create_runtime_v3(
+        subgraph_ptr, weight_cache.get(), nullptr, 0, &runtime_ptr);
+    Result<std::vector<std::string>> packed_weights_added = weight_cache.finalize_for_runtime();
+    ASSERT_TRUE(packed_weights_added.ok());
+    ASSERT_EQ(packed_weights_added.get().size(), 1);
+    ASSERT_EQ(packed_weights_added.get()[0], "weightbias");
+
+    auto runtime = std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)>(
+        runtime_ptr, xnn_delete_runtime);
+
+    const std::array<xnn_external_value, 2> external = {
+        xnn_external_value{0, input_data},
+        xnn_external_value{1, output_data},
+    };
+
+    status = xnn_reshape_runtime(runtime.get());
+    status =
+        xnn_setup_runtime_v2(runtime.get(), external.size(), external.data());
+
+    ASSERT_EQ(status, xnn_status_success);
+    status = xnn_invoke_runtime(runtime.get());
+    ASSERT_EQ(status, xnn_status_success);
+  }
+
+  // Program builder constants.
+  static constexpr int kSegmentAlignment = 16;
+  static constexpr std::array<int, 2> kSegmentSizes{384, 128};
+  static constexpr std::array<int, 2> kSegmentOffsets{0, kSegmentAlignment * 2};
+  std::array<uint8_t, 512> sample_data_;
+
+  // Program builder.
+  flatbuffers::FlatBufferBuilder builder_;
+  const executorch_flatbuffer::Program* program_;
+
+  // Data loader for the sample data.
+  std::unique_ptr<FileDataLoader> data_map_loader_;
+
+  // PteDataMap
+  std::unique_ptr<PteDataMap> data_map_;
+
+  // MemoryAllocator
+  std::array<uint8_t, 200> memory_allocator_data_;
+  std::unique_ptr<MemoryAllocator> memory_allocator_;
+};
+
+
+TEST_F(XNNWeightsCacheTest, ReusePackedWeights) {
+    XNNWeightsCache weight_cache;
+    size_t padding = 32;
+
+    std::vector<size_t> batches{1, 2, 3};
+    size_t num_batches = 1;
+    for (size_t batch_dim : batches) {
+      num_batches *= batch_dim;
+    }
+    size_t input_channels = 3;
+    size_t output_channels = 4;
+    std::vector<float> input_tensor(num_batches * input_channels + padding, 1.0f);
+    std::vector<float> output_tensor(num_batches * output_channels, 0.0f);
+    float* input_data = input_tensor.data();
+    float* output_data = output_tensor.data();
+
+    // Build the same graph twice against one cache. Setup and teardown
+    // errors are asserted so a failure is reported where it happens, and
+    // ASSERT_NO_FATAL_FAILURE stops the test if the helper's ASSERTs fire.
+    for (int run = 0; run < 2; ++run) {
+      ASSERT_EQ(
+          weight_cache.initialize_for_runtime(
+              memory_allocator_.get(), data_map_.get()),
+          Error::Ok);
+      ASSERT_NO_FATAL_FAILURE(BuildAndRunGraphWithWeightsCache(
+          weight_cache,
+          batches,
+          input_channels,
+          output_channels,
+          input_data,
+          output_data));
+    }
+    ASSERT_EQ(weight_cache.get_num_unpacked_data(), 0);
+    ASSERT_EQ(
+        weight_cache.delete_packed_data(weight_cache.get_packed_data_names()),
+        Error::Ok);
+    std::vector<std::string> packed_data_names = weight_cache.get_packed_data_names();
+    // check packed data names have been deleted
+    ASSERT_EQ(packed_data_names.size(), 0);
+}
diff --git a/backends/xnnpack/test/targets.bzl b/backends/xnnpack/test/targets.bzl
index 30ce970a842..052e7ea91d5 100644
--- a/backends/xnnpack/test/targets.bzl
+++ b/backends/xnnpack/test/targets.bzl
@@ -27,6 +27,16 @@ def define_common_targets():
             third_party_dep("FP16"),
             "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
             "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+        ],
+    )
+
+    runtime.cxx_test(
+        name = "test_xnn_weights_cache",
+        srcs = ["runtime/test_xnn_weights_cache.cpp"],
+        deps = [
             "//executorch/backends/xnnpack:xnnpack_backend",
+            "//executorch/runtime/executor:pte_data_map",
+            "//executorch/extension/data_loader:file_data_loader",
+            "//executorch/extension/testing_util:temp_file",
         ],
     )
diff --git a/extension/testing_util/targets.bzl b/extension/testing_util/targets.bzl
index 2b12480dfff..95b1f94d182 100644
--- a/extension/testing_util/targets.bzl
+++ b/extension/testing_util/targets.bzl
@@ -17,5 +17,6 @@ def define_common_targets():
             "//executorch/extension/fb/ptez/decompression_methods/test/...",
             "//executorch/extension/fb/ptez/test/...",
             "//executorch/runtime/executor/test/...",
+            "//executorch/backends/xnnpack/test/...",
         ],
     )