From aa84b3a6868bc509dbd272377ef10d12eb949cef Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Mon, 24 Feb 2025 19:43:10 -0800 Subject: [PATCH 1/3] [ET-VK] Adding a workgroup class to VecUtils Pull Request resolved: https://github.com/pytorch/executorch/pull/8632 This diff adds a new class called `WorkgroupSize` to the `VecUtils` header file. The `WorkgroupSize` class takes three `uint32_t` values as parameters and stores them in a single `uint32_t` variable using bitwise operations. This class is used in the Vulkan backend to specify the size of a workgroup for a given operation. ghstack-source-id: 268172661 @exported-using-ghexport Differential Revision: [D70021019](https://our.internmc.facebook.com/intern/diff/D70021019/) --- backends/vulkan/runtime/utils/VecUtils.h | 44 ++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/backends/vulkan/runtime/utils/VecUtils.h b/backends/vulkan/runtime/utils/VecUtils.h index ad4434cf5af..c084a563544 100644 --- a/backends/vulkan/runtime/utils/VecUtils.h +++ b/backends/vulkan/runtime/utils/VecUtils.h @@ -479,5 +479,49 @@ inline int64_t multiply_integers(Iter begin, Iter end) { begin, end, static_cast(1), std::multiplies<>()); } +class WorkgroupSize final { + uint32_t val; + + public: + explicit WorkgroupSize() : val(0) {} + explicit WorkgroupSize(const uint32_t x, const uint32_t y, const uint32_t z) { + // shift numbers by multiple of 11 bits, since each local workgroup axis can + // be 1024 at most and which is 0x400. only z axis can't store 1024, because + // it would overflow uint32_t storage. + if (z == 1024) { + throw std::runtime_error( + "Workgroup size in z axis cannot be 1024 because it would overflow uint32_t storage"); + } + val = x | (y << 11) | (z << 22); + } + + explicit WorkgroupSize(const uvec3& vec) { + // shift numbers by multiple of 11 bits, since each local workgroup axis can + // be 1024 at most and which is 0x400. only z axis can't store 1024, because + // it would overflow uint32_t storage. + if (vec[2u] == 1024) { + throw std::runtime_error( + "Workgroup size in z axis cannot be 1024 because it would overflow uint32_t storage"); + } + val = vec[0u] | (vec[1u] << 11) | (vec[2u] << 22); + } + + explicit inline operator uvec3() const { + return { + val & 0x7ffu, + (val >> 11) & 0x7ffu, + (val >> 22), + }; + } + + explicit inline operator uint32_t() const { + return val; + } + + inline constexpr uint32_t operator[](const int idx) const { + return (val >> (11 * idx)) & 0x7ffu; + } +}; + } // namespace utils } // namespace vkcompute From 64f426cb1114639e06fb68919144901b3f797270 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Mon, 24 Feb 2025 19:43:11 -0800 Subject: [PATCH 2/3] [ET-VK] Adding reserve and append functions to SpecVarList Pull Request resolved: https://github.com/pytorch/executorch/pull/8633 This diff adds two new functions to the SpecVarList class in the Vulkan runtime library. The first function, reserve, allows the user to reserve a certain amount of space in the SpecVarList before adding any elements. The second function, append, allows the user to add a single SpecVar to the SpecVarList. These functions are useful for optimizing memory usage and improving performance in the Vulkan runtime. ghstack-source-id: 268172659 @exported-using-ghexport Differential Revision: [D70021782](https://our.internmc.facebook.com/intern/diff/D70021782/) --- backends/vulkan/runtime/vk_api/Pipeline.cpp | 8 ++++++++ backends/vulkan/runtime/vk_api/Pipeline.h | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp index 0c66a085ad9..51b59ed4d1f 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.cpp +++ b/backends/vulkan/runtime/vk_api/Pipeline.cpp @@ -174,6 +174,14 @@ void SpecVarList::append(const SpecVarList& other) { vars.insert(vars.end(), other.vars.begin(), other.vars.end()); } +void SpecVarList::reserve(const size_t size) { + vars.reserve(size); +} + +void SpecVarList::append(const SpecVar& other) { + vars.push_back(other); +} + std::vector SpecVarList::generate_map_entries() const { std::vector map_entries; diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h index 5460a0acba7..b9f4e3d2a35 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.h +++ b/backends/vulkan/runtime/vk_api/Pipeline.h @@ -82,6 +82,10 @@ class SpecVarList final { void append(const SpecVarList& other); + void reserve(const size_t size); + + void append(const SpecVar& other); + std::vector generate_map_entries() const; friend bool operator==(const SpecVarList& lhs, const SpecVarList& rhs); From 7ca2a25c09d80d01c0f0d40437d3c48c473a2576 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Mon, 24 Feb 2025 19:43:12 -0800 Subject: [PATCH 3/3] [ET-VK] Replacing the use of uvec3 with WorkgroupSize class to reduce memory usage and improve processing speed Pull Request resolved: https://github.com/pytorch/executorch/pull/8634 This diff replaces the use of `uvec3` with `WorkgroupSize` class to reduce memory usage and improve processing speed in the Vulkan backend of Executorch. ghstack-source-id: 268172660 @exported-using-ghexport Differential Revision: [D70021032](https://our.internmc.facebook.com/intern/diff/D70021032/) --- backends/vulkan/runtime/api/Context.cpp | 16 +++++----------- backends/vulkan/runtime/api/Context.h | 14 +++++++++----- .../vulkan/runtime/graph/ops/BlitNode.cpp | 2 +- .../vulkan/runtime/graph/ops/DispatchNode.h | 2 +- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 4 ++-- .../vulkan/runtime/graph/ops/PrepackNode.h | 2 +- backends/vulkan/runtime/vk_api/Command.cpp | 2 +- backends/vulkan/runtime/vk_api/Command.h | 6 +++--- backends/vulkan/runtime/vk_api/Pipeline.cpp | 19 ++++++++++++++----- backends/vulkan/runtime/vk_api/Pipeline.h | 4 ++++ backends/vulkan/runtime/vk_api/Shader.h | 2 +- 11 files changed, 42 insertions(+), 31 deletions(-) diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 8178ada3a45..64f32e50f4e 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -74,7 +74,7 @@ void Context::cmd_reset_querypool() { void Context::report_shader_dispatch_start( const std::string& shader_name, const utils::uvec3& global_wg_size, - const utils::uvec3& local_wg_size, + const utils::WorkgroupSize& local_wg_size, const uint32_t dispatch_id) { if (querypool_) { querypool_.shader_profile_begin( @@ -82,7 +82,7 @@ void Context::report_shader_dispatch_start( dispatch_id, shader_name, vkapi::create_extent3d(global_wg_size), - vkapi::create_extent3d(local_wg_size)); + vkapi::create_extent3d((utils::uvec3)local_wg_size)); } } @@ -115,7 +115,7 @@ void Context::check_device_capabilities(const vkapi::ShaderInfo& shader) { vkapi::DescriptorSet Context::get_descriptor_set( const vkapi::ShaderInfo& shader_descriptor, - const utils::uvec3& local_workgroup_size, + const utils::WorkgroupSize& local_workgroup_size, const vkapi::SpecVarList& additional_constants, const uint32_t push_constants_size) { VkDescriptorSetLayout shader_layout = @@ -124,17 +124,11 @@ vkapi::DescriptorSet Context::get_descriptor_set( VkPipelineLayout pipeline_layout = pipeline_layout_cache().retrieve(shader_layout, push_constants_size); - vkapi::SpecVarList spec_constants = { - SV(local_workgroup_size[0u]), - SV(local_workgroup_size[1u]), - SV(local_workgroup_size[2u])}; - - spec_constants.append(additional_constants); - VkPipeline pipeline = pipeline_cache().retrieve( {pipeline_layout_cache().retrieve(shader_layout, push_constants_size), shader_cache().retrieve(shader_descriptor), - spec_constants}); + additional_constants, + local_workgroup_size}); cmd_.bind_pipeline(pipeline, pipeline_layout, local_workgroup_size); diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h index 8bbcf79b45c..6cfbc64f141 100644 --- a/backends/vulkan/runtime/api/Context.h +++ b/backends/vulkan/runtime/api/Context.h @@ -11,6 +11,7 @@ // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName #include +#include #include #include @@ -150,7 +151,7 @@ class Context final { void report_shader_dispatch_start( const std::string& shader_name, const utils::uvec3& global_wg_size, - const utils::uvec3& local_wg_size, + const utils::WorkgroupSize& local_wg_size, const uint32_t dispatch_id = UINT32_MAX); /* @@ -189,13 +190,13 @@ class Context final { vkapi::DescriptorSet get_descriptor_set( const vkapi::ShaderInfo&, - const utils::uvec3&, + const utils::WorkgroupSize&, const vkapi::SpecVarList&, const uint32_t push_constants_size); inline vkapi::DescriptorSet get_descriptor_set( const vkapi::ShaderInfo& shader_descriptor, - const utils::uvec3& local_work_group_size) { + const utils::WorkgroupSize& local_work_group_size) { return get_descriptor_set(shader_descriptor, local_work_group_size, {}, 0u); } @@ -362,14 +363,17 @@ inline bool Context::submit_compute_job( report_shader_dispatch_start( shader.kernel_name, global_work_group, - local_work_group_size, + utils::WorkgroupSize(local_work_group_size), dispatch_id); // Factor out template parameter independent code to minimize code bloat. // Note that push constants are not exposed yet via this API, therefore the // push constants size is assumed to be 0. vkapi::DescriptorSet descriptor_set = get_descriptor_set( - shader, local_work_group_size, specialization_constants, 0u); + shader, + utils::WorkgroupSize(local_work_group_size), + specialization_constants, + 0u); detail::bind( descriptor_set, diff --git a/backends/vulkan/runtime/graph/ops/BlitNode.cpp b/backends/vulkan/runtime/graph/ops/BlitNode.cpp index 463a2d19c36..03ee4caa51a 100644 --- a/backends/vulkan/runtime/graph/ops/BlitNode.cpp +++ b/backends/vulkan/runtime/graph/ops/BlitNode.cpp @@ -46,7 +46,7 @@ void BlitNode::encode(ComputeGraph* graph) { kernel_name += vkapi::to_string(dst_tensor->dtype()); context->report_shader_dispatch_start( - kernel_name, utils::uvec3(), utils::uvec3(), node_id_); + kernel_name, utils::uvec3(), utils::WorkgroupSize(), node_id_); context->register_blit( pipeline_barrier, diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h index 7d04f7714e9..4661b5bf9cf 100644 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.h +++ b/backends/vulkan/runtime/graph/ops/DispatchNode.h @@ -92,7 +92,7 @@ class DispatchNode final : public ExecuteNode { protected: const vkapi::ShaderInfo shader_; const utils::uvec3 global_workgroup_size_; - const utils::uvec3 local_workgroup_size_; + const utils::WorkgroupSize local_workgroup_size_; const vkapi::ParamsBindList params_; const vkapi::SpecVarList spec_vars_; const std::vector push_constants_; diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index bf501296b1b..0507b679e13 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -100,8 +100,8 @@ void PrepackNode::encode(ComputeGraph* graph) { // bound with the correct image layout. { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::DescriptorSet descriptor_set = - context->get_descriptor_set(noop_shader_, {1, 1, 1}); + vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( + noop_shader_, utils::WorkgroupSize(1, 1, 1)); bind_tensor_to_descriptor_set( *packed, diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index 3e713303c3d..2d194e7f6a0 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -49,7 +49,7 @@ class PrepackNode final { const vkapi::ShaderInfo shader_; vkapi::ShaderInfo noop_shader_; const utils::uvec3 global_workgroup_size_; - const utils::uvec3 local_workgroup_size_; + const utils::WorkgroupSize local_workgroup_size_; const ValueRef tref_; const ValueRef packed_; const vkapi::ParamsBindList params_; diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp index 3be790b53cf..3a5041f9500 100644 --- a/backends/vulkan/runtime/vk_api/Command.cpp +++ b/backends/vulkan/runtime/vk_api/Command.cpp @@ -81,7 +81,7 @@ void CommandBuffer::end() { void CommandBuffer::bind_pipeline( VkPipeline pipeline, VkPipelineLayout pipeline_layout, - const utils::uvec3 local_workgroup_size) { + const utils::WorkgroupSize local_workgroup_size) { VK_CHECK_COND( state_ == CommandBuffer::State::RECORDING, "Vulkan CommandBuffer: called bind_pipeline() on a command buffer whose state " diff --git a/backends/vulkan/runtime/vk_api/Command.h b/backends/vulkan/runtime/vk_api/Command.h index 99cd5d17c99..ff1e5934a5c 100644 --- a/backends/vulkan/runtime/vk_api/Command.h +++ b/backends/vulkan/runtime/vk_api/Command.h @@ -51,7 +51,7 @@ class CommandBuffer final { struct Bound { VkPipeline pipeline; VkPipelineLayout pipeline_layout; - utils::uvec3 local_workgroup_size; + utils::WorkgroupSize local_workgroup_size; VkDescriptorSet descriptors; explicit Bound() @@ -63,7 +63,7 @@ class CommandBuffer final { inline void reset() { pipeline = VK_NULL_HANDLE; pipeline_layout = VK_NULL_HANDLE; - local_workgroup_size = {0u, 0u, 0u}; + local_workgroup_size = utils::WorkgroupSize{0u, 0u, 0u}; descriptors = VK_NULL_HANDLE; } }; @@ -87,7 +87,7 @@ class CommandBuffer final { void begin(); void end(); - void bind_pipeline(VkPipeline, VkPipelineLayout, const utils::uvec3); + void bind_pipeline(VkPipeline, VkPipelineLayout, const utils::WorkgroupSize); void bind_descriptors(VkDescriptorSet); void set_push_constants(VkPipelineLayout, const void*, uint32_t); diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp index 51b59ed4d1f..b5ee47cd2c7 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.cpp +++ b/backends/vulkan/runtime/vk_api/Pipeline.cpp @@ -275,14 +275,23 @@ ComputePipeline::ComputePipeline( const ComputePipeline::Descriptor& descriptor, VkPipelineCache pipeline_cache) : device_(device), handle_{VK_NULL_HANDLE} { - std::vector map_entries = - descriptor.specialization_constants.generate_map_entries(); + SpecVarList specialization_constants; + + specialization_constants.reserve( + 3 + descriptor.specialization_constants.size()); + specialization_constants.append(descriptor.local_wg_size[0]); + specialization_constants.append(descriptor.local_wg_size[1]); + specialization_constants.append(descriptor.local_wg_size[2]); + + specialization_constants.append(descriptor.specialization_constants); + const std::vector map_entries = + specialization_constants.generate_map_entries(); const VkSpecializationInfo specialization_info{ - descriptor.specialization_constants.size(), // mapEntryCount + specialization_constants.size(), // mapEntryCount map_entries.data(), // pMapEntries - descriptor.specialization_constants.data_nbytes(), // dataSize - descriptor.specialization_constants.data(), // pData + specialization_constants.data_nbytes(), // dataSize + specialization_constants.data(), // pData }; const VkPipelineShaderStageCreateInfo shader_stage_create_info{ diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h index b9f4e3d2a35..3248051d12a 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.h +++ b/backends/vulkan/runtime/vk_api/Pipeline.h @@ -156,6 +156,7 @@ class ComputePipeline final { VkPipelineLayout pipeline_layout; VkShaderModule shader_module; SpecVarList specialization_constants; + utils::WorkgroupSize local_wg_size; }; explicit ComputePipeline( @@ -273,6 +274,9 @@ class ComputePipelineCache final { seed = utils::hash_combine(seed, new_seed); } + seed = utils::hash_combine( + seed, std::hash()((uint32_t)descriptor.local_wg_size)); + return seed; } }; diff --git a/backends/vulkan/runtime/vk_api/Shader.h b/backends/vulkan/runtime/vk_api/Shader.h index d9fec65febc..7d0fa7b7476 100644 --- a/backends/vulkan/runtime/vk_api/Shader.h +++ b/backends/vulkan/runtime/vk_api/Shader.h @@ -61,7 +61,7 @@ struct ShaderInfo final { ShaderLayout::Signature kernel_layout{}; // Shader Metadata - utils::uvec3 out_tile_size{1u, 1u, 1u}; + utils::WorkgroupSize out_tile_size{1u, 1u, 1u}; bool requires_shader_int16 = false; bool requires_16bit_storage = false; bool requires_8bit_storage = false;