diff --git a/src/gpu.cpp b/src/gpu.cpp
index 00a711d0951..06c7c29b4d9 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -3734,7 +3734,7 @@ int VulkanDevice::create_pipeline_layout(int push_constant_count, VkDescriptorSe
     return 0;
 }
 
-int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const
+int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipelineCache* vk_pipeline_cache, VkPipeline* pipeline) const
 {
     const int specialization_count = specializations.size();
 
@@ -3792,7 +3792,11 @@ int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout
     computePipelineCreateInfo.basePipelineHandle = 0;
     computePipelineCreateInfo.basePipelineIndex = 0;
 
-    VkResult ret = vkCreateComputePipelines(d->device, 0, 1, &computePipelineCreateInfo, 0, pipeline);
+    VkResult ret;
+    if (vk_pipeline_cache)
+        ret = vkCreateComputePipelines(d->device, *vk_pipeline_cache, 1, &computePipelineCreateInfo, 0, pipeline);
+    else
+        ret = vkCreateComputePipelines(d->device, VK_NULL_HANDLE, 1, &computePipelineCreateInfo, 0, pipeline);
     if (ret != VK_SUCCESS)
     {
         NCNN_LOGE("vkCreateComputePipelines failed %d", ret);
@@ -3801,6 +3805,40 @@ int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout
     return 0;
 }
 
+int VulkanDevice::create_empty_pipeline_cache(VkPipelineCache* vk_pipeline_cache) const
+{
+    VkPipelineCacheCreateInfo info;
+    info.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
+    info.pNext = 0;
+    info.flags = VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT;
+    info.initialDataSize = 0;
+    info.pInitialData = 0;
+    VkResult ret = vkCreatePipelineCache(d->device, &info, 0, vk_pipeline_cache);
+    if (ret != VK_SUCCESS)
+    {
+        NCNN_LOGE("vkCreatePipelineCache failed %d", ret);
+        return -1;
+    }
+
+    return 0;
+}
+
+int VulkanDevice::create_pipeline_cache_with_data(const void* initial_data, size_t data_size, VkPipelineCache* vk_pipeline_cache) const
+{
+    VkPipelineCacheCreateInfo info;
+    info.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
+    info.pNext = 0;
+    info.flags = VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT;
+    info.initialDataSize = data_size;
+    info.pInitialData = initial_data;
+    VkResult ret = vkCreatePipelineCache(d->device, &info, 0, vk_pipeline_cache);
+    if (ret != VK_SUCCESS)
+    {
+        NCNN_LOGE("vkCreatePipelineCache failed %d", ret);
+        return -1;
+    }
+
+    return 0;
+}
+
 int VulkanDevice::create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const
 {
diff --git a/src/gpu.h b/src/gpu.h
index 7863b2e21a4..5037922d0b8 100644
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -419,7 +419,10 @@ class NCNN_EXPORT VulkanDevice
     // helper for creating pipeline
     int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
     int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
-    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const;
+    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipelineCache* vk_pipeline_cache, VkPipeline* pipeline) const;
+    int create_empty_pipeline_cache(VkPipelineCache* vk_pipeline_cache) const;
+    int create_pipeline_cache_with_data(const void* initial_data, size_t data_size, VkPipelineCache* vk_pipeline_cache) const;
+
     int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
 
     uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
diff --git a/src/pipelinecache.cpp b/src/pipelinecache.cpp
index 1bd27451440..4dfda99b694 100644
--- a/src/pipelinecache.cpp
+++ b/src/pipelinecache.cpp
@@ -5,8 +5,21 @@
 #include "gpu.h"
 
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <string>
+
+#ifdef _WIN32
+#include <windows.h>
+#include <direct.h>
+#else
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <unistd.h>
+#endif
 
 namespace ncnn {
-
 #if NCNN_VULKAN
 // https://en.wikipedia.org/wiki/MurmurHash
 static uint32_t murmur3_32(const uint32_t* data, int size)
@@ -51,9 +64,114 @@ static uint32_t fnv1a_32(const uint8_t* data, int size)
     return h;
 }
 
+static int atomic_rename(const char* old_path, const char* new_path)
+{
+#ifdef _WIN32
+    if (MoveFileExA(old_path, new_path, MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH))
+        return 0;
+    return -1;
+#else
+    return std::rename(old_path, new_path);
+#endif // _WIN32
+}
+
+static int make_dir(const std::string& dirpath)
+{
+    if (dirpath.empty())
+        return -1;
+
+    std::string dir = dirpath;
+
+#ifdef _WIN32
+    for (size_t i = 0; i < dir.size(); i++)
+    {
+        if (dir[i] == '/')
+            dir[i] = '\\';
+    }
+
+    size_t start = (dir.size() > 2 && dir[1] == ':') ? 3 : 0;
+
+    for (size_t i = start; i <= dir.size(); i++)
+    {
+        if (i == dir.size() || dir[i] == '\\')
+        {
+            char tmp = dir[i];
+            dir[i] = '\0';
+            if (_mkdir(dir.c_str()) != 0 && errno != EEXIST)
+            {
+                return -1;
+            }
+            dir[i] = tmp;
+        }
+    }
+#else
+    size_t start = dir[0] == '/' ? 1 : 0;
+
+    for (size_t i = start; i <= dir.size(); i++)
+    {
+        if (i == dir.size() || dir[i] == '/')
+        {
+            char tmp = dir[i];
+            dir[i] = '\0';
+            if (mkdir(dir.c_str(), 0755) != 0 && errno != EEXIST)
+            {
+                return -1;
+            }
+            dir[i] = tmp;
+        }
+    }
+#endif
+
+    return 0;
+}
+
+static constexpr uint32_t spv_cache_magic()
+{
+    return ('S' | 'P' << 8 | 'V' << 16 | 'C' << 24);
+}
+
+enum class PipelineCacheIOResult
+{
+    Success,
+    FileFailure,
+    InvalidFile,
+    InvalidCache,
+    DataCorruption,
+    CreationFailure,
+};
+
 class PipelineCachePrivate
 {
 public:
+    static constexpr uint32_t CURRENT_SPV_CACHE_HEADER_VERSION = 1;
+    static constexpr uint32_t CURRENT_PIPELINE_CACHE_VERSION = 1;
+
+    PipelineCachePrivate()
+    {
+#ifdef _WIN32
+        shader_cache_dir = std::string(getenv("LOCALAPPDATA") ? getenv("LOCALAPPDATA") : ".") + "/ncnn/shadercache";
+#else
+        shader_cache_dir = std::string(getenv("HOME") ? getenv("HOME") : ".") + "/.ncnn/shadercache";
+#endif
+    }
+
+    struct pipeline_cache_prefix_header
+    {
+        uint32_t magic;
+        uint32_t version;
+        uint32_t data_size;
+        uint32_t data_hash_fnv1a; // fnv1a hash
+
+        uint32_t vendor_id;
+        uint32_t device_id;
+        uint32_t driver_version;
+        uint32_t driver_abi;
+
+        uint8_t uuid[VK_UUID_SIZE];
+
+        uint32_t reserved[4];
+    };
+
     // digest -> artifact
     struct pipeline_cache_digest
     {
@@ -110,9 +228,78 @@ class PipelineCachePrivate
 
         ShaderInfo shader_info; // TODO use pointer ?
     };
 
+    struct spv_cache_header
+    {
+        uint32_t magic;          // magic number, 'SPVC' in host endian
+        uint32_t header_version; // version of the cache header format
+        uint32_t ncnn_version;   // ncnn version when the cache is created
+        // if ncnn is upgraded and glslang, the shader code or the preprocessing steps change,
+        // we want the cache to be invalidated
+
+        uint32_t spv_size;          // size of spv binary data
+        uint32_t data_hash_fnv1a;   // hash of spv binary data using fnv1a
+        uint32_t data_hash_murmur3; // second hash of spv binary data using murmur3
+
+        // since a driver update or device switch may change the supported extensions
+        // and the defines added to the shader code, we verify that the cache is valid for the current device
+        uint32_t vendor_id;
+        uint32_t device_id;
+        uint32_t driver_version;
+        uint8_t uuid[VK_UUID_SIZE];
+        uint32_t reserved[4]; // reserved for future use, must be zero
+    };
+
     mutable std::vector<pipeline_cache_digest> cache_digests;
     mutable std::vector<pipeline_cache_artifact> cache_artifacts;
+    mutable std::map<uint64_t, std::vector<uint32_t> > spv_code_cache;
+    mutable VkPipelineCache pipeline_cache = VK_NULL_HANDLE;
     mutable Mutex cache_lock;
+    mutable std::string shader_cache_dir;
+
+    int load_spv_code_cache_from_disk(const VulkanDevice& device, uint64_t shader_key) const;
+    PipelineCacheIOResult try_load_pipeline_cache_from_disk(const VulkanDevice* vkdev, const char* path);
+    int save_spv_code_cache_to_disk(uint64_t shader_key, const VulkanDevice& device, const std::vector<uint32_t>& spirv) const;
+
+    static constexpr uint32_t vk_pipeline_cache_header_magic()
+    {
+        return ('V' | 'P' << 8 | 'C' << 16 | 'H' << 24); // Vulkan Pipeline Cache Header
+    }
+
+    static bool validate_pipeline_cache_header(const pipeline_cache_prefix_header& header, const VkPhysicalDeviceProperties& physical_device_properties)
+    {
+        if (header.magic != vk_pipeline_cache_header_magic())
+            return false;
+        if (header.vendor_id != physical_device_properties.vendorID)
+            return false;
+        if (header.device_id != physical_device_properties.deviceID)
+            return false;
+        if (header.driver_version != physical_device_properties.driverVersion)
+            return false;
+        if (header.driver_abi != sizeof(void*))
+            return false;
+        if (memcmp(header.uuid, physical_device_properties.pipelineCacheUUID, VK_UUID_SIZE) != 0)
+            return false;
+        return true;
+    }
+
+    static bool validate_spv_code_cache(const spv_cache_header& header, const VkPhysicalDeviceProperties& physical_device_properties)
+    {
+        if (header.magic != spv_cache_magic())
+            return false;
+        if (header.header_version != CURRENT_SPV_CACHE_HEADER_VERSION)
+            return false;
+        if (header.vendor_id != physical_device_properties.vendorID)
+            return false;
+        if (header.device_id != physical_device_properties.deviceID)
+            return false;
+        if (header.driver_version != physical_device_properties.driverVersion)
+            return false;
+        if (header.spv_size % 4 != 0)
+            return false;
+        if (memcmp(header.uuid, physical_device_properties.pipelineCacheUUID, VK_UUID_SIZE) != 0)
+            return false;
+        return true;
+    }
 };
 
 PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
@@ -133,18 +320,36 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_
     specializations_fnv1a = fnv1a_32((const uint8_t*)specializations.data(), specialization_count * sizeof(vk_specialization_type));
 }
 
+static uint32_t encode_opt_bits(const Option& opt)
+{
+    return 0 << 7
+           | opt.use_fp16_packed << 6
+           | opt.use_fp16_storage << 5
+           | opt.use_fp16_arithmetic << 4
+           | opt.use_int8_storage << 3
+           | opt.use_int8_arithmetic << 2;
+}
+
+static uint64_t shader_spv_key(int shader_type_index, const Option& opt)
+{
+    // TODO: if the shader code is changed, using shader_type_index alone is not enough
+    return static_cast<uint64_t>(shader_type_index) << 32
+           | static_cast<uint64_t>(opt.use_fp16_uniform) << 31
+           | static_cast<uint64_t>(opt.use_int8_uniform) << 30
+           | static_cast<uint64_t>(opt.use_int8_packed) << 29
+           | static_cast<uint64_t>(opt.use_subgroup_ops) << 28
+           | static_cast<uint64_t>(opt.use_shader_pack8) << 27
+           | static_cast<uint64_t>(opt.use_shader_local_memory) << 26
+           | encode_opt_bits(opt);
+}
+
 PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
         uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
 {
     shader_type_index = _shader_type_index;
 
     // encode opt
-    opt_bits = 0 << 7
-               | opt.use_fp16_packed << 6
-               | opt.use_fp16_storage << 5
-               | opt.use_fp16_arithmetic << 4
-               | opt.use_int8_storage << 3
-               | opt.use_int8_arithmetic << 2;
+    opt_bits = encode_opt_bits(opt);
 
     local_size_x = _local_size_x;
     local_size_y = _local_size_y;
@@ -216,6 +421,14 @@ void PipelineCache::clear()
         }
     }
 
+    if (d->pipeline_cache)
+    {
+        vkDestroyPipelineCache(vkdev->vkdevice(), d->pipeline_cache, 0);
+        d->pipeline_cache = VK_NULL_HANDLE;
+    }
+
+    d->spv_code_cache.clear();
+
     d->cache_digests.clear();
     d->cache_artifacts.clear();
 }
@@ -334,8 +547,6 @@ int PipelineCache::get_pipeline(int shader_type_index, const Option& opt, const
             *descriptor_update_template = cc.descriptor_update_template;
             shader_info = cc.shader_info;
 
-            // NCNN_LOGE("get_pipeline hit %d", last_digest_index);
-
             return 0;
         }
     }
@@ -381,20 +592,322 @@ int PipelineCache::get_pipeline(int shader_type_index, const Option& opt, const
     return 0;
 }
 
+int PipelineCachePrivate::load_spv_code_cache_from_disk(const VulkanDevice& device, uint64_t shader_key) const
+{
+    std::string cachepath = shader_cache_dir + "/" + std::to_string(shader_key) + ".spvcache";
+
+    FILE* fp = fopen(cachepath.c_str(), "rb");
+    if (!fp)
+    {
+        return -1;
+    }
+
+    spv_cache_header header;
+    if (fread(&header, sizeof(header), 1, fp) != 1)
+    {
+        NCNN_LOGE("load_spv_code_cache_from_disk fread header failed");
+        fclose(fp);
+        return -1;
+    }
+
+    if (!validate_spv_code_cache(header, device.info.physicalDeviceProperties()))
+    {
+        NCNN_LOGE("load_spv_code_cache_from_disk validate_spv_code_cache failed");
+        fclose(fp);
+        return -1;
+    }
+
+    std::vector<uint32_t> spirv;
+    spirv.resize(header.spv_size / 4);
+    size_t nread = fread(spirv.data(), 1, header.spv_size, fp);
+    fclose(fp);
+
+    if (nread != header.spv_size)
+    {
+        NCNN_LOGE("load_spv_code_cache_from_disk fread spirv data failed %zu != %u", nread, header.spv_size);
+        return -1;
+    }
+
+    uint32_t hash_fnv1a = fnv1a_32(reinterpret_cast<const uint8_t*>(spirv.data()), header.spv_size);
+    if (hash_fnv1a != header.data_hash_fnv1a)
+    {
+        NCNN_LOGE("load_spv_code_cache_from_disk data hash1 mismatch %x != %x", hash_fnv1a, header.data_hash_fnv1a);
+        return -1;
+    }
+
+    uint32_t hash_murmur3 = murmur3_32(spirv.data(), spirv.size());
+    if (hash_murmur3 != header.data_hash_murmur3)
+    {
+        NCNN_LOGE("load_spv_code_cache_from_disk data hash2 mismatch %x != %x", hash_murmur3, header.data_hash_murmur3);
+        return -1;
+    }
+
+    spv_code_cache[shader_key] = std::move(spirv);
+    return 0;
+}
+
+PipelineCacheIOResult PipelineCachePrivate::try_load_pipeline_cache_from_disk(const VulkanDevice* vkdev, const char* path)
+{
+    FILE* file = fopen(path, "rb");
+    if (!file)
+    {
+        return PipelineCacheIOResult::FileFailure;
+    }
+
+    fseek(file, 0, SEEK_END);
+    long pos = ftell(file);
+    if (pos == -1L)
+    {
+        fclose(file);
+        return PipelineCacheIOResult::FileFailure;
+    }
+    size_t file_size = static_cast<size_t>(pos);
+    rewind(file);
+
+    if (file_size < sizeof(pipeline_cache_prefix_header))
+    {
+        fclose(file);
+        return PipelineCacheIOResult::InvalidFile;
+    }
+
+    std::vector<uint8_t> buffer(file_size - sizeof(pipeline_cache_prefix_header));
+    pipeline_cache_prefix_header header;
+    if (fread(&header, sizeof(pipeline_cache_prefix_header), 1, file) != 1)
+    {
+        fclose(file);
+        return PipelineCacheIOResult::InvalidFile;
+    }
+    if (fread(buffer.data(), 1, file_size - sizeof(pipeline_cache_prefix_header), file) != file_size - sizeof(PipelineCachePrivate::pipeline_cache_prefix_header))
+    {
+        fclose(file);
+        return PipelineCacheIOResult::DataCorruption;
+    }
+    fclose(file);
+
+    if (header.magic != vk_pipeline_cache_header_magic())
+    {
+        return PipelineCacheIOResult::InvalidCache;
+    }
+
+    if (header.version != CURRENT_PIPELINE_CACHE_VERSION)
+    {
+        return PipelineCacheIOResult::InvalidCache;
+    }
+
+    void* cache_data_begin = buffer.data();
+    const VkPhysicalDeviceProperties& device_properties = vkdev->info.physicalDeviceProperties();
+    if (!validate_pipeline_cache_header(header, device_properties))
+    {
+        return PipelineCacheIOResult::InvalidCache;
+    }
+
+    size_t cache_data_size = header.data_size;
+    if (cache_data_size == 0 || cache_data_size > buffer.size())
+    {
+        return PipelineCacheIOResult::DataCorruption;
+    }
+
+    uint32_t hash = fnv1a_32(reinterpret_cast<const uint8_t*>(cache_data_begin), cache_data_size);
+    if (hash != header.data_hash_fnv1a)
+    {
+        return PipelineCacheIOResult::DataCorruption;
+    }
+
+    if (vkdev->create_pipeline_cache_with_data(cache_data_begin, cache_data_size, &pipeline_cache) != 0)
+    {
+        return PipelineCacheIOResult::CreationFailure;
+    }
+
+    return PipelineCacheIOResult::Success;
+}
+
+int PipelineCachePrivate::save_spv_code_cache_to_disk(uint64_t shader_key, const VulkanDevice& device, const std::vector<uint32_t>& spirv) const
+{
+    std::string cachepath = shader_cache_dir + "/" + std::to_string(shader_key) + ".spvcache";
+    std::string tmp_cachepath = cachepath + ".tmp";
+
+    make_dir(shader_cache_dir);
+
+    FILE* fp = fopen(tmp_cachepath.c_str(), "wb");
+    if (!fp)
+    {
+        NCNN_LOGE("save_spv_code_cache_to_disk fopen %s failed", tmp_cachepath.c_str());
+        return -1;
+    }
+
+    spv_cache_header header;
+    header.magic = spv_cache_magic();
+    header.header_version = CURRENT_SPV_CACHE_HEADER_VERSION;
+    header.ncnn_version = 0;
+    header.spv_size = spirv.size() * sizeof(uint32_t);
+
+    header.data_hash_fnv1a = fnv1a_32((const uint8_t*)spirv.data(), header.spv_size);   // fnv1a hash
+    header.data_hash_murmur3 = murmur3_32((const uint32_t*)spirv.data(), spirv.size()); // murmur3 hash
+
+    const VkPhysicalDeviceProperties& physical_device_properties = device.info.physicalDeviceProperties();
+    header.vendor_id = physical_device_properties.vendorID;
+    header.device_id = physical_device_properties.deviceID;
+    header.driver_version = physical_device_properties.driverVersion;
+    memcpy(header.uuid, physical_device_properties.pipelineCacheUUID, VK_UUID_SIZE);
+    memset(header.reserved, 0, sizeof(header.reserved));
+
+    if (fwrite(&header, sizeof(header), 1, fp) != 1)
+    {
+        NCNN_LOGE("save_spv_code_cache_to_disk fwrite header failed");
+        fclose(fp);
+        return -1;
+    }
+
+    if (fwrite(spirv.data(), sizeof(uint32_t), spirv.size(), fp) != spirv.size())
+    {
+        NCNN_LOGE("save_spv_code_cache_to_disk fwrite spirv data failed");
+        fclose(fp);
+        return -1;
+    }
+
+    fclose(fp);
+
+    if (atomic_rename(tmp_cachepath.c_str(), cachepath.c_str()) != 0)
+    {
+        NCNN_LOGE("save_spv_code_cache_to_disk rename %s to %s failed", tmp_cachepath.c_str(), cachepath.c_str());
+        return -1;
+    }
+
+    return 0;
+}
+
+int PipelineCache::load_pipeline_cache(const char* path) const
+{
+    MutexLockGuard lock(d->cache_lock);
+    if (d->pipeline_cache != VK_NULL_HANDLE)
+    {
+        NCNN_LOGE("a valid pipeline cache already exists, stop loading");
+        return 0;
+    }
+    PipelineCacheIOResult result = d->try_load_pipeline_cache_from_disk(vkdev, path);
+    if (result == PipelineCacheIOResult::Success)
+        return 0;
+
+    switch (result)
+    {
+    case PipelineCacheIOResult::FileFailure:
+        NCNN_LOGE("Failed to open pipeline cache file: %s", path);
+        break;
+    case PipelineCacheIOResult::InvalidFile:
+        NCNN_LOGE("File %s is not a valid pipeline cache file", path);
+        break;
+    case PipelineCacheIOResult::InvalidCache:
+        NCNN_LOGE("The cache in file %s is not valid for the current platform", path);
+        break;
+    case PipelineCacheIOResult::DataCorruption:
+        NCNN_LOGE("Data in file %s is corrupted", path);
+        break;
+    case PipelineCacheIOResult::CreationFailure:
+        NCNN_LOGE("Failed to create pipeline cache from data in file %s", path);
+        break;
+    default:
+        break;
+    }
+
+    NCNN_LOGE("Failed to load pipeline cache from file %s, falling back to an empty pipeline cache", path);
+    if (vkdev->create_empty_pipeline_cache(&d->pipeline_cache) != 0)
+    {
+        NCNN_LOGE("Failed to create pipeline cache");
+        return -1;
+    }
+
+    return 0;
+}
+
+int PipelineCache::save_pipeline_cache(const char* path) const
+{
+    MutexLockGuard lock(d->cache_lock);
+    if (d->pipeline_cache == VK_NULL_HANDLE)
+        return 0;
+
+    size_t cache_data_size;
+    if (vkGetPipelineCacheData(vkdev->vkdevice(), d->pipeline_cache, &cache_data_size, nullptr) != VK_SUCCESS)
+    {
+        NCNN_LOGE("Failed to get pipeline cache data");
+        return -1;
+    }
+
+    std::vector<uint8_t> buffer(cache_data_size);
+    if (vkGetPipelineCacheData(vkdev->vkdevice(), d->pipeline_cache, &cache_data_size, buffer.data()) != VK_SUCCESS)
+    {
+        NCNN_LOGE("Failed to get pipeline cache data");
+        return -1;
+    }
+
+    const VkPhysicalDeviceProperties& device_properties = vkdev->info.physicalDeviceProperties();
+
+    PipelineCachePrivate::pipeline_cache_prefix_header header = {};
+    header.vendor_id = device_properties.vendorID;
+    header.device_id = device_properties.deviceID;
+    header.driver_version = device_properties.driverVersion;
+    header.driver_abi = sizeof(void*);
+    header.version = PipelineCachePrivate::CURRENT_PIPELINE_CACHE_VERSION;
+    std::copy_n(device_properties.pipelineCacheUUID, VK_UUID_SIZE, header.uuid);
+    header.data_size = cache_data_size;
+    header.magic = PipelineCachePrivate::vk_pipeline_cache_header_magic();
+
+    header.data_hash_fnv1a = fnv1a_32(reinterpret_cast<const uint8_t*>(buffer.data()), cache_data_size); // fnv1a hash
+
+    std::string expected_path = path;
+    std::string temp_file_path = expected_path + ".tmp";
+    FILE* file = fopen(temp_file_path.c_str(), "wb");
+    if (!file)
+    {
+        NCNN_LOGE("Failed to open temporary file %s for writing pipeline cache", temp_file_path.c_str());
+        return -1;
+    }
+
+    size_t header_bytes_written = fwrite(&header, 1, sizeof(PipelineCachePrivate::pipeline_cache_prefix_header), file);
+    size_t data_bytes_written = fwrite(buffer.data(), 1, cache_data_size, file);
+    if (header_bytes_written != sizeof(PipelineCachePrivate::pipeline_cache_prefix_header) || data_bytes_written != cache_data_size)
+    {
+        NCNN_LOGE("Failed to write pipeline cache data to file %s", temp_file_path.c_str());
+        fclose(file);
+        return -1;
+    }
+
+    fclose(file);
+
+    if (atomic_rename(temp_file_path.c_str(), expected_path.c_str()) != 0)
+    {
+        NCNN_LOGE("Failed to rename file %s to %s", temp_file_path.c_str(), path);
+        return -1;
+    }
+
+    return 0;
+}
+
 int PipelineCache::create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
                                         VkShaderModule* _shader_module, ShaderInfo& si) const
 {
+    const uint32_t* spv_data = nullptr;
+    size_t spv_data_size = 0;
+    uint64_t key = shader_spv_key(shader_type_index, opt);
+
     std::vector<uint32_t> spirv;
-    int retc = compile_spirv_module(shader_type_index, opt, spirv);
-    if (retc != 0)
+    if (d->spv_code_cache.find(key) != d->spv_code_cache.end() || d->load_spv_code_cache_from_disk(*vkdev, key) == 0)
     {
-        NCNN_LOGE("compile_spirv_module failed %d", retc);
-        return -1;
+        const std::vector<uint32_t>& spirv_cache = d->spv_code_cache[key];
+        spv_data = spirv_cache.data();
+        spv_data_size = spirv_cache.size() * 4;
     }
+    else
+    {
+        int retc = compile_spirv_module(shader_type_index, opt, spirv);
+        if (retc != 0)
+        {
+            NCNN_LOGE("compile_spirv_module failed %d", retc);
+            return -1;
+        }
 
-    const uint32_t* spv_data = spirv.data();
-    size_t spv_data_size = spirv.size() * 4;
+        d->spv_code_cache[key] = spirv;
+        int ret = d->save_spv_code_cache_to_disk(key, *vkdev, spirv);
+        if (ret != 0)
+        {
+            NCNN_LOGE("save_spv_code_cache_to_disk failed");
+        }
+
+        spv_data = spirv.data();
+        spv_data_size = spirv.size() * 4;
+    }
 
     int ret = resolve_shader_info(spv_data, spv_data_size, si);
     if (ret != 0)
@@ -445,7 +958,13 @@ int PipelineCache::new_pipeline(VkShaderModule shader_module, const ShaderInfo&
     if (ret != 0)
         goto ERROR_PipelineCache;
 
-    ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &pipeline);
+    if (!d->pipeline_cache)
+    {
+        ret = vkdev->create_empty_pipeline_cache(&d->pipeline_cache);
+        if (ret != 0)
+            NCNN_LOGE("vkdev->create_empty_pipeline_cache failed, not using pipeline cache");
+    }
+    ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &d->pipeline_cache, &pipeline);
     if (ret != 0)
         goto ERROR_PipelineCache;
 
@@ -491,6 +1010,76 @@ int PipelineCache::new_pipeline(VkShaderModule shader_module, const ShaderInfo&
     return -1;
 }
 
+void PipelineCache::set_shader_cache_dir(const char* dir)
+{
+    MutexLockGuard lock(d->cache_lock);
+    d->shader_cache_dir = dir;
+}
+
+static bool clear_directory(const std::string& path)
+{
+#ifdef _WIN32
+    WIN32_FIND_DATAA findData;
+    HANDLE hFind = FindFirstFileA((path + "\\*").c_str(), &findData);
+    if (hFind == INVALID_HANDLE_VALUE)
+        return false;
+
+    do
+    {
+        std::string name = findData.cFileName;
+        if (name == "." || name == "..")
+            continue;
+
+        std::string fullPath = path + "\\" + name;
+        if (findData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
+        {
+            clear_directory(fullPath);
+            RemoveDirectoryA(fullPath.c_str());
+        }
+        else
+        {
+            DeleteFileA(fullPath.c_str());
+        }
+    } while (FindNextFileA(hFind, &findData));
+
+    FindClose(hFind);
+    return true;
+#else
+    DIR* dir = opendir(path.c_str());
+    if (!dir)
+        return false;
+
+    struct dirent* entry;
+    while ((entry = readdir(dir)) != nullptr)
+    {
+        std::string name = entry->d_name;
+        if (name == "." || name == "..")
+            continue;
+
+        std::string fullPath = path + "/" + name;
+        struct stat st;
+        if (stat(fullPath.c_str(), &st) == 0)
+        {
+            if (S_ISDIR(st.st_mode))
+            {
+                clear_directory(fullPath);
+                rmdir(fullPath.c_str());
+            }
+            else
+            {
+                unlink(fullPath.c_str());
+            }
+        }
+    }
+    closedir(dir);
+    return true;
+#endif
+}
+
+int PipelineCache::clear_shader_cache() const
+{
+    MutexLockGuard lock(d->cache_lock);
+    d->spv_code_cache.clear();
+
+    if (clear_directory(d->shader_cache_dir))
+        return 0;
+    return -1;
+}
+
 #endif // NCNN_VULKAN
 
 } // namespace ncnn
diff --git a/src/pipelinecache.h b/src/pipelinecache.h
index b93c0cfd8f0..f4a58efbd72 100644
--- a/src/pipelinecache.h
+++ b/src/pipelinecache.h
@@ -24,6 +24,10 @@ class NCNN_EXPORT PipelineCache
 
     void clear();
 
+    void set_shader_cache_dir(const char* dir);
+
+    int clear_shader_cache() const;
+
     int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
                      uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t subgroup_size,
                      VkShaderModule* shader_module,
@@ -42,6 +46,10 @@ class NCNN_EXPORT PipelineCache
                      VkDescriptorUpdateTemplateKHR* descriptor_update_template,
                      ShaderInfo& shader_info) const;
 
+    int load_pipeline_cache(const char* path) const;
+
+    int save_pipeline_cache(const char* path) const;
+
 protected:
     int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
diff --git a/src/simplevk.h b/src/simplevk.h
index c2c7060dd1d..7507b15be42 100644
--- a/src/simplevk.h
+++ b/src/simplevk.h
@@ -1097,6 +1097,12 @@ typedef enum VkCommandBufferResetFlagBits
 } VkCommandBufferResetFlagBits;
 typedef VkFlags VkCommandBufferResetFlags;
 
+typedef enum VkPipelineCacheCreateFlagBits {
+    VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT = 0x00000001,
+    VK_PIPELINE_CACHE_CREATE_INTERNALLY_SYNCHRONIZED_MERGE_BIT_KHR = 0x00000008,
+    VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT_EXT = VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT,
+} VkPipelineCacheCreateFlagBits;
+
 typedef struct VkApplicationInfo
 {
     VkStructureType sType;
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5a0940e88c6..86e6203e2a2 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -65,6 +65,7 @@ ncnn_add_test(paramdict)
 
 if(NCNN_VULKAN)
     ncnn_add_test(command)
+    ncnn_add_test(pipeline_cache)
 endif()
 
 if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
diff --git a/tests/test_pipeline_cache.cpp b/tests/test_pipeline_cache.cpp
new file mode 100644
index 00000000000..ed0a7dcf014
--- /dev/null
+++ b/tests/test_pipeline_cache.cpp
@@ -0,0 +1,366 @@
+#include "benchmark.h"
+#include "testutil.h"
+#include "layer.h"
+#include "mat.h"
+#include "option.h"
+#include "gpu.h"
+#include "pipelinecache.h"
+
+#include <cstdio>
+#include <future>
+#ifdef _WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+
+void random_truncate_file(const char* filename, size_t new_size)
+{
+    FILE* fp = fopen(filename, "rb+");
+    if (!fp) return;
+#ifdef _WIN32
+    int fd = _fileno(fp);
+    _chsize(fd, new_size);
+#else
+    int fd = fileno(fp);
+    ftruncate(fd, new_size);
+#endif
+    fclose(fp);
+}
+
+void corrupt_file(const char* filename)
+{
+    int mode = RandomInt(0, 10000) % 3;
+    if (mode == 0)
+    {
+        if (remove(filename) != 0)
+            fprintf(stderr, "Failed to remove file %s\n", filename);
+        return;
+    }
+    if (mode == 1)
+    {
+        // empty file
+        FILE* f = fopen(filename, "wb");
+        if (!f) return;
+        fclose(f);
+        return;
+    }
+    // truncate to a random size between 1 and the original file size
+    FILE* fp = fopen(filename, "rb");
+    if (!fp) return;
+    fseek(fp, 0, SEEK_END);
+    long file_size = ftell(fp);
+    fclose(fp);
+
+    if (file_size <= 0) return;
+
+    size_t new_size = (size_t)(RandomInt(0, 10000) % file_size + 1);
+    random_truncate_file(filename, new_size);
+}
+
+bool test_pipeline_creation(const ncnn::Option& opt, double* build_time = nullptr, int layer_type_index = 0)
+{
+    const ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(0);
+    ncnn::Pipeline pipeline(vkdev);
+    double start = ncnn::get_current_time();
+    int ret = pipeline.create(0, opt, std::vector<ncnn::vk_specialization_type> {1});
+    double end = ncnn::get_current_time();
+    if (build_time) *build_time = end - start;
+    if (ret != 0) return false;
+    return true;
+}
+
+bool pipeline_cache_test_basic_creation()
+{
+    fprintf(stdout, "Start basic test\n");
+    ncnn::create_gpu_instance();
+    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(0);
+    const int options[][6] = {
+        {0, 0, 0, 0, 0, 0},
+    };
+
+    ncnn::Option opt{};
+    opt.num_threads = 1;
+    opt.use_packing_layout = options[0][0];
+    opt.use_fp16_packed = options[0][1];
+    opt.use_fp16_storage = options[0][2];
+    opt.use_fp16_arithmetic = options[0][3];
+    opt.use_bf16_storage = options[0][4];
+    opt.use_shader_pack8 = options[0][5];
+
+    double duration_1;
+    if (vkdev->get_pipeline_cache()->clear_shader_cache() != 0)
+    {
+        fprintf(stderr, "clear shader cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+    if (!test_pipeline_creation(opt, &duration_1))
+    {
+        fprintf(stderr, "pipeline creation without cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+    fprintf(stdout, "pipeline cache test creation time (without cache): %.2f ms\n", duration_1);
+    if (vkdev->get_pipeline_cache()->save_pipeline_cache("vk_pipeline_cache") != 0)
+    {
+        fprintf(stderr, "save pipeline cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+
+    ncnn::destroy_gpu_instance();
+
+    ncnn::create_gpu_instance();
+
+    int ret = ncnn::get_gpu_device(0)->get_pipeline_cache()->load_pipeline_cache("vk_pipeline_cache");
+    if (ret != 0)
+    {
+        fprintf(stderr, "load pipeline cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+    double duration_2;
+    if (!test_pipeline_creation(opt, &duration_2))
+    {
+        fprintf(stderr, "pipeline creation with cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+    fprintf(stdout, "pipeline cache test creation time (with cache): %.2f ms\n", duration_2);
+    remove("vk_pipeline_cache");
+    ncnn::destroy_gpu_instance();
+    return true;
+}
+
+bool pipeline_cache_test_corrupted_cache_file()
+{
+    fprintf(stdout, "Start file corruption test\n");
+    // first create and save a cache file
+    ncnn::create_gpu_instance();
+    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(0);
+    const int options[][6] = {
+        {0, 0, 0, 0, 0, 0},
+    };
+
+    ncnn::Option opt{};
+    opt.num_threads = 1;
+    opt.use_packing_layout = options[0][0];
+    opt.use_fp16_packed = options[0][1];
+    opt.use_fp16_storage = options[0][2];
+    opt.use_fp16_arithmetic = options[0][3];
+    opt.use_bf16_storage = options[0][4];
+    opt.use_shader_pack8 = options[0][5];
+
+    if (vkdev->get_pipeline_cache()->clear_shader_cache() != 0)
+    {
+        fprintf(stderr, "clear shader cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+    double duration_1;
+    if (!test_pipeline_creation(opt, &duration_1))
+    {
+        fprintf(stderr, "pipeline creation without cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+
+    fprintf(stdout, "pipeline cache test creation time (without cache): %.2f ms\n", duration_1);
+    if (vkdev->get_pipeline_cache()->save_pipeline_cache("vk_pipeline_cache") != 0)
+    {
+        fprintf(stderr, "save pipeline cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+    ncnn::destroy_gpu_instance();
+
+    corrupt_file("vk_pipeline_cache");
+
+    ncnn::create_gpu_instance();
+    int ret = ncnn::get_gpu_device(0)->get_pipeline_cache()->load_pipeline_cache("vk_pipeline_cache");
+    if (ret)
+    {
+        fprintf(stderr, "load cache after file corruption failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+    double duration_2;
+    if (!test_pipeline_creation(opt, &duration_2))
+    {
+        fprintf(stderr, "pipeline creation after cache corruption failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+    fprintf(stdout, "pipeline cache test creation time (after cache corruption): %.2f ms\n", duration_2);
+    remove("vk_pipeline_cache");
+    ncnn::destroy_gpu_instance();
+    return true;
+}
+
+bool pipeline_cache_test_multithread_creation()
+{
+    fprintf(stdout, "Start multi-thread test\n");
+
+    ncnn::create_gpu_instance();
+    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(0);
+
+    ncnn::Option opt{};
+    opt.num_threads = 1;
+    opt.use_packing_layout = 0;
+    opt.use_fp16_packed = 1;
+    opt.use_fp16_storage = 0;
+    opt.use_fp16_arithmetic = 0;
+    opt.use_bf16_storage = 1;
+    opt.use_shader_pack8 = 0;
+
+    if (vkdev->get_pipeline_cache()->clear_shader_cache() != 0)
+    {
+        fprintf(stderr, "clear shader cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+    double duration;
+    if (!test_pipeline_creation(opt, &duration))
+    {
+        fprintf(stderr, "pipeline creation failed before multi-thread test\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+    if (vkdev->get_pipeline_cache()->save_pipeline_cache("vk_pipeline_cache") != 0)
+    {
+        fprintf(stderr, "save pipeline cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+    ncnn::destroy_gpu_instance();
+
+    ncnn::create_gpu_instance();
+    vkdev = ncnn::get_gpu_device(0);
+    if (vkdev->get_pipeline_cache()->load_pipeline_cache("vk_pipeline_cache") != 0)
+    {
+        fprintf(stderr, "load pipeline cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+
+    const int thread_count = 8;
+    std::vector<std::future<bool> > futures;
+    for (int i = 0; i < thread_count; i++)
+    {
+        futures.emplace_back(std::async(std::launch::async, [&opt]() {
+            return test_pipeline_creation(opt, nullptr);
+        }));
+    }
+
+    bool all_ok = true;
+    for (auto& fut : futures)
+    {
+        if (!fut.get())
+            all_ok = false;
+    }
+
+    remove("vk_pipeline_cache");
+    ncnn::destroy_gpu_instance();
+
+    if (!all_ok)
+    {
+        fprintf(stderr, "multi-thread pipeline creation failed\n");
+        return false;
+    }
+
+    fprintf(stdout, "multi-thread pipeline creation passed\n");
+    return true;
+}
+
+bool pipeline_cache_test_multithread_save()
+{
+    fprintf(stdout, "Start multi-thread save test\n");
+
+    ncnn::create_gpu_instance();
+    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(0);
+
+    ncnn::Option opt{};
+    opt.num_threads = 1;
+    opt.use_packing_layout = 0;
+    opt.use_fp16_packed = 0;
+    opt.use_fp16_storage = 0;
+    opt.use_fp16_arithmetic = 0;
+    opt.use_bf16_storage = 0;
+    opt.use_shader_pack8 = 0;
+
+    if (vkdev->get_pipeline_cache()->clear_shader_cache() != 0)
+    {
+        fprintf(stderr, "clear shader cache failed\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+
+    if (!test_pipeline_creation(opt, nullptr))
+    {
+        fprintf(stderr, "pipeline creation failed before multi-thread save test\n");
+        ncnn::destroy_gpu_instance();
+        return false;
+    }
+
+    const int thread_count = 8;
+    std::vector<std::future<int> > futures;
+    for (int i = 0; i < thread_count; i++)
+    {
+        futures.emplace_back(std::async(std::launch::async, [vkdev]() {
+            return vkdev->get_pipeline_cache()->save_pipeline_cache("vk_pipeline_cache");
+        }));
+    }
+
+    bool all_ok = true;
+    for (auto& fut : futures)
+    {
+        if (fut.get() != 0)
+            all_ok = false;
+    }
+
+    ncnn::destroy_gpu_instance();
+
+    if (!all_ok)
+    {
+        fprintf(stderr, "multi-thread save_pipeline_cache had errors\n");
+        return false;
+    }
+
+    ncnn::create_gpu_instance();
+    vkdev = ncnn::get_gpu_device(0);
+    int ret = vkdev->get_pipeline_cache()->load_pipeline_cache("vk_pipeline_cache");
+    remove("vk_pipeline_cache");
+    ncnn::destroy_gpu_instance();
+
+    if (ret != 0)
+    {
+        fprintf(stderr, "cache file after multi-thread save is invalid\n");
+        return false;
+    }
+
+    fprintf(stdout, "multi-thread save_pipeline_cache passed\n");
+    return true;
+}
+
+int main()
+{
+    SRAND(7767517);
+    if (!pipeline_cache_test_basic_creation())
+    {
+        fprintf(stderr, "pipeline cache basic test failed\n");
+        return -1;
+    }
+    if (!pipeline_cache_test_corrupted_cache_file())
+    {
+        fprintf(stderr, "pipeline cache corrupted file test failed\n");
+        return -1;
+    }
+    if (!pipeline_cache_test_multithread_creation())
+    {
+        fprintf(stderr, "pipeline cache multi-thread creation test failed\n");
+        return -1;
+    }
+    if (!pipeline_cache_test_multithread_save())
+    {
+        fprintf(stderr, "pipeline cache multi-thread save test failed\n");
+        return -1;
+    }
+    fprintf(stdout, "All pipeline cache tests passed\n");
+    return 0;
+}
\ No newline at end of file
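
Usage note (not part of the patch): the sketch below shows one way an application might drive the new PipelineCache API, assuming the default device-owned PipelineCache returned by VulkanDevice::get_pipeline_cache() is used; the cache file name is illustrative. load_pipeline_cache() falls back to an empty VkPipelineCache when the file is missing or invalid, so it can be called unconditionally at startup, and save_pipeline_cache() writes the driver blob prefixed with the device/driver header that is validated on the next load. Compiled SPIR-V is cached separately on disk (by default under ~/.ncnn/shadercache or %LOCALAPPDATA%/ncnn/shadercache, per the constructor in this patch).

    #include "gpu.h"
    #include "pipelinecache.h"

    int main()
    {
        ncnn::create_gpu_instance();

        // The device-owned cache; load/save/clear_shader_cache are const methods.
        const ncnn::PipelineCache* cache = ncnn::get_gpu_device(0)->get_pipeline_cache();

        // Seed the pipeline cache from a previous run; returns 0 even when the
        // file is absent or corrupted (an empty cache is created instead).
        cache->load_pipeline_cache("vk_pipeline_cache.bin");

        // ... create ncnn::Net / ncnn::Pipeline objects and run Vulkan inference ...

        // Persist the accumulated VkPipelineCache data for the next process.
        cache->save_pipeline_cache("vk_pipeline_cache.bin");

        ncnn::destroy_gpu_instance();
        return 0;
    }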