Skip to content

Commit a21a30a

Browse files
committed
Update on "[ET-VK] Using shared variable to store calculated output positions to free up registers and improve performance."
This diff introduces a shared variable to store the calculated output positions in the conv2d_pw op, freeing up registers and improving performance. The code changes include adding a shared variable to hold the calculated positions and modifying the existing code to use that shared variable. Differential Revision: [D67742567](https://our.internmc.facebook.com/intern/diff/D67742567/) [ghstack-poisoned]
2 parents 76c2011 + f7b60e3 commit a21a30a

File tree

26 files changed

+182
-113
lines changed

26 files changed

+182
-113
lines changed

.github/workflows/android-perf.yml

+4-2
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ jobs:
9898
- uses: actions/checkout@v3
9999

100100
- name: Prepare the spec
101+
id: prepare
101102
shell: bash
102103
env:
103104
BENCHMARK_CONFIG: ${{ toJSON(matrix) }}
@@ -111,7 +112,7 @@ jobs:
111112
# so let's just sed it
112113
sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' android-llm-device-farm-test-spec.yml.j2
113114
114-
BENCHMARK_CONFIG_ID="${{ matrix.model }}_${{ matrix.config }}"
115+
BENCHMARK_CONFIG_ID=$(echo "${{ matrix.model }}_${{ matrix.config }}" | sed -e 's/[^A-Za-z0-9._-]/_/g')
115116
# The config for this benchmark runs, we save it in the test spec so that it can be fetched
116117
# later by the upload script
117118
sed -i -e 's,{{ benchmark_config_id }},'"${BENCHMARK_CONFIG_ID}"',g' android-llm-device-farm-test-spec.yml.j2
@@ -122,6 +123,7 @@ jobs:
122123
123124
# Save the benchmark configs so that we can use it later in the dashboard
124125
echo "${BENCHMARK_CONFIG}" > "${BENCHMARK_CONFIG_ID}.json"
126+
echo "benchmark-config-id=${BENCHMARK_CONFIG_ID}" >> $GITHUB_OUTPUT
125127
126128
- name: Upload the spec
127129
uses: seemethere/upload-artifact-s3@v5
@@ -141,7 +143,7 @@ jobs:
141143
${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/
142144
retention-days: 1
143145
if-no-files-found: error
144-
path: extension/benchmark/android/benchmark/${{ matrix.model }}_${{ matrix.config }}.json
146+
path: extension/benchmark/android/benchmark/${{ steps.prepare.outputs.benchmark-config-id }}.json
145147

146148
export-models:
147149
name: export-models

.github/workflows/apple-perf.yml

+4-2
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ jobs:
100100
- uses: actions/checkout@v3
101101

102102
- name: Prepare the spec
103+
id: prepare
103104
shell: bash
104105
env:
105106
BENCHMARK_CONFIG: ${{ toJSON(matrix) }}
@@ -113,7 +114,7 @@ jobs:
113114
# so let's just sed it
114115
sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' default-ios-device-farm-appium-test-spec.yml.j2
115116
116-
BENCHMARK_CONFIG_ID="${{ matrix.model }}_${{ matrix.config }}"
117+
BENCHMARK_CONFIG_ID=$(echo "${{ matrix.model }}_${{ matrix.config }}" | sed -e 's/[^A-Za-z0-9._-]/_/g')
117118
# The config for this benchmark runs, we save it in the test spec so that it can be fetched
118119
# later by the upload script
119120
sed -i -e 's,{{ benchmark_config_id }},'"${BENCHMARK_CONFIG_ID}"',g' default-ios-device-farm-appium-test-spec.yml.j2
@@ -124,6 +125,7 @@ jobs:
124125
125126
# Save the benchmark configs so that we can use it later in the dashboard
126127
echo "${BENCHMARK_CONFIG}" > "${BENCHMARK_CONFIG_ID}.json"
128+
echo "benchmark-config-id=${BENCHMARK_CONFIG_ID}" >> $GITHUB_OUTPUT
127129
128130
- name: Upload the spec
129131
uses: seemethere/upload-artifact-s3@v5
@@ -143,7 +145,7 @@ jobs:
143145
${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/
144146
retention-days: 1
145147
if-no-files-found: error
146-
path: extension/benchmark/apple/Benchmark/${{ matrix.model }}_${{ matrix.config }}.json
148+
path: extension/benchmark/apple/Benchmark/${{ steps.prepare.outputs.benchmark-config-id }}.json
147149

148150
export-models:
149151
name: export-models

backends/cadence/aot/compiler.py

-16
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,11 @@
1818
)
1919
from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion
2020
from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer
21-
22-
from executorch.backends.cadence.aot.replace_ops import ReplaceSafeSoftmaxWithSoftmax
2321
from executorch.backends.cadence.aot.utils import (
2422
get_default_memory_config,
2523
MemoryConfig,
26-
model_gm_has_SDPA,
2724
model_is_quantized,
2825
)
29-
from executorch.backends.transforms.decompose_sdpa import (
30-
DecomposeScaledDotProductAttention,
31-
)
3226
from executorch.devtools import generate_etrecord
3327
from executorch.exir import (
3428
EdgeCompileConfig,
@@ -91,16 +85,6 @@ def convert_pt2(
9185
.module()
9286
)
9387

94-
if model_gm_has_SDPA(model_gm):
95-
# Decompose SDPA
96-
DecomposeScaledDotProductAttention(False)(model_gm)
97-
98-
# Swap _safe_softmax with _softmax (see https://github.com/pytorch/pytorch/pull/133882
99-
# for details).
100-
result = ReplaceSafeSoftmaxWithSoftmax()(model_gm)
101-
assert result is not None
102-
model_gm = result.graph_module
103-
10488
# Prepare
10589
prepared_model = prepare_pt2e(model_gm, quantizer)
10690

backends/cadence/aot/compiler_utils.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -129,16 +129,16 @@ def get_transposed_dims(node: torch.fx.Node, dims: List[int]) -> List[int]:
129129

130130

131131
# Capture the effect of permute op on incoming dimension order
132-
def get_permuted_dims(node: torch.fx.Node, dims: Optional[List[int]]) -> List[int]:
132+
def get_permuted_dims(node: torch.fx.Node, dims: Optional[Sequence[int]]) -> List[int]:
133133
"""
134134
Given a permute node, and the incoming dimension ordering of the input
135135
tensor to the permute node, return the net effect of permute op on the
136136
dimension order.
137137
"""
138138
assert node.target == exir_ops.edge.aten.permute_copy.default
139139
# Permute each index of the dimension ordering (dims)
140-
permute_dims = node.args[1]
141-
assert isinstance(permute_dims, List)
140+
# pyre-fixme[6]: This combined typecheck isn't supported yet.
141+
permute_dims: List[int] = list(node.args[1])
142142
assert all(isinstance(x, int) for x in permute_dims)
143143
# If the dims is empty, we can simply return the permute order
144144
if not dims:

backends/cadence/aot/reorder_ops.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -438,9 +438,9 @@ def postpone_dequantize_op(self, graph_module: torch.fx.GraphModule) -> bool:
438438
args=(user, *node.args[1:]),
439439
)
440440
dequant_node.meta = user.meta.copy()
441-
# Remove meta["debug_handle"] on new node. Reassign it at the
442-
# caller level by calling generate_missing_debug_handles
443-
dequant_node.meta.pop("debug_handle")
441+
# Remove meta["debug_handle"] on new node if it exists.
442+
# Reassign it at the caller level by calling generate_missing_debug_handles
443+
dequant_node.meta.pop("debug_handle", None)
444444
user.replace_all_uses_with(dequant_node)
445445
dequant_node.args = (user, *node.args[1:])
446446

backends/cadence/aot/utils.py

-8
Original file line numberDiff line numberDiff line change
@@ -235,14 +235,6 @@ def print_ops_info(
235235
)
236236

237237

238-
def model_gm_has_SDPA(model_gm: torch.fx.GraphModule) -> bool:
239-
for node in model_gm.graph.nodes:
240-
if node.op == "call_function":
241-
if node.target == torch.ops.aten.scaled_dot_product_attention.default:
242-
return True
243-
return False
244-
245-
246238
def save_pte_program(
247239
prog: ExecutorchProgramManager, model_name: str, output_dir: str = ""
248240
) -> None:

backends/vulkan/runtime/api/Context.cpp

+5-4
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,13 @@ void Context::report_shader_dispatch_end() {
9090
vkapi::DescriptorSet Context::get_descriptor_set(
9191
const vkapi::ShaderInfo& shader_descriptor,
9292
const utils::uvec3& local_workgroup_size,
93-
const vkapi::SpecVarList& additional_constants) {
93+
const vkapi::SpecVarList& additional_constants,
94+
const uint32_t push_constants_size) {
9495
VkDescriptorSetLayout shader_layout =
9596
shader_layout_cache().retrieve(shader_descriptor.kernel_layout);
9697

9798
VkPipelineLayout pipeline_layout =
98-
pipeline_layout_cache().retrieve(shader_layout);
99+
pipeline_layout_cache().retrieve(shader_layout, push_constants_size);
99100

100101
vkapi::SpecVarList spec_constants = {
101102
SV(local_workgroup_size[0u]),
@@ -105,7 +106,7 @@ vkapi::DescriptorSet Context::get_descriptor_set(
105106
spec_constants.append(additional_constants);
106107

107108
VkPipeline pipeline = pipeline_cache().retrieve(
108-
{pipeline_layout_cache().retrieve(shader_layout),
109+
{pipeline_layout_cache().retrieve(shader_layout, push_constants_size),
109110
shader_cache().retrieve(shader_descriptor),
110111
spec_constants});
111112

@@ -151,7 +152,7 @@ void Context::register_shader_dispatch(
151152
const VkDescriptorSetLayout shader_layout =
152153
shader_layout_cache().retrieve(shader_descriptor.kernel_layout);
153154
const VkPipelineLayout pipeline_layout =
154-
pipeline_layout_cache().retrieve(shader_layout);
155+
pipeline_layout_cache().retrieve(shader_layout, push_constants_size);
155156
cmd_.set_push_constants(
156157
pipeline_layout, push_constants_data, push_constants_size);
157158
}

backends/vulkan/runtime/api/Context.h

+6-3
Original file line numberDiff line numberDiff line change
@@ -188,12 +188,13 @@ class Context final {
188188
vkapi::DescriptorSet get_descriptor_set(
189189
const vkapi::ShaderInfo&,
190190
const utils::uvec3&,
191-
const vkapi::SpecVarList&);
191+
const vkapi::SpecVarList&,
192+
const uint32_t push_constants_size);
192193

193194
inline vkapi::DescriptorSet get_descriptor_set(
194195
const vkapi::ShaderInfo& shader_descriptor,
195196
const utils::uvec3& local_work_group_size) {
196-
return get_descriptor_set(shader_descriptor, local_work_group_size, {});
197+
return get_descriptor_set(shader_descriptor, local_work_group_size, {}, 0u);
197198
}
198199

199200
void register_shader_dispatch(
@@ -333,8 +334,10 @@ inline bool Context::submit_compute_job(
333334
dispatch_id);
334335

335336
// Factor out template parameter independent code to minimize code bloat.
337+
// Note that push constants are not exposed yet via this API, therefore the
338+
// push constants size is assumed to be 0.
336339
vkapi::DescriptorSet descriptor_set = get_descriptor_set(
337-
shader, local_work_group_size, specialization_constants);
340+
shader, local_work_group_size, specialization_constants, 0u);
338341

339342
detail::bind(
340343
descriptor_set,

backends/vulkan/runtime/api/containers/ParamsBuffer.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,9 @@ class ParamsBuffer final {
3131
vulkan_buffer_(
3232
context_p_->adapter_ptr()->vma().create_params_buffer(block)) {}
3333

34-
template <typename Block>
35-
ParamsBuffer(Context* context_p, const VkDeviceSize nbytes)
34+
// The last bool argument, though unused, is required to disambiguate this
35+
// constructor from the one above.
36+
ParamsBuffer(Context* context_p, const VkDeviceSize nbytes, const bool unused)
3637
: context_p_(context_p),
3738
vulkan_buffer_(
3839
context_p_->adapter_ptr()->vma().create_uniform_buffer(nbytes)) {}

backends/vulkan/runtime/api/containers/Tensor.cpp

+27-16
Original file line numberDiff line numberDiff line change
@@ -658,66 +658,77 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
658658
}
659659

660660
const vkapi::BufferBindInfo vTensor::sizes_ubo() {
661+
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
662+
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
661663
if (!uniforms_.buffer()) {
662-
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
664+
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
663665
}
664666
if (sizes_uniform_offset_ == kUniformOffsetUnset) {
665667
VK_CHECK_COND(
666-
(uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize,
668+
(uniforms_size_ + size_per_ubo) <= max_ubo_size,
667669
"Uniform data allocation has exceeded Tensor uniform buffer size");
668670
sizes_uniform_offset_ = uniforms_size_;
669-
uniforms_size_ += kSizePerUniform;
671+
uniforms_size_ += size_per_ubo;
670672
uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
671673
}
672-
return vkapi::BufferBindInfo(uniforms_.buffer(), sizes_uniform_offset_);
674+
return vkapi::BufferBindInfo(
675+
uniforms_.buffer(), sizes_uniform_offset_, size_per_ubo);
673676
}
674677

675678
const vkapi::BufferBindInfo vTensor::strides_ubo() {
679+
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
680+
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
676681
if (!uniforms_.buffer()) {
677-
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
682+
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
678683
}
679684
if (unsqueezed_strides_offset_ == kUniformOffsetUnset) {
680685
VK_CHECK_COND(
681-
(uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize,
686+
(uniforms_size_ + size_per_ubo) <= max_ubo_size,
682687
"Uniform data allocation has exceeded Tensor uniform buffer size");
683688
unsqueezed_strides_offset_ = uniforms_size_;
684-
uniforms_size_ += kSizePerUniform;
689+
uniforms_size_ += size_per_ubo;
685690
uniforms_.update(
686691
utils::make_whcn_ivec4(unsqueezed_strides_),
687692
unsqueezed_strides_offset_);
688693
}
689-
return vkapi::BufferBindInfo(uniforms_.buffer(), unsqueezed_strides_offset_);
694+
return vkapi::BufferBindInfo(
695+
uniforms_.buffer(), unsqueezed_strides_offset_, size_per_ubo);
690696
}
691697

692698
const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
699+
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
700+
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
693701
if (!uniforms_.buffer()) {
694-
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
702+
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
695703
}
696704
if (logical_limits_uniform_offset_ == kUniformOffsetUnset) {
697705
VK_CHECK_COND(
698-
(uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize,
706+
(uniforms_size_ + size_per_ubo) <= max_ubo_size,
699707
"Uniform data allocation has exceeded Tensor uniform buffer size");
700708
logical_limits_uniform_offset_ = uniforms_size_;
701-
uniforms_size_ += kSizePerUniform;
709+
uniforms_size_ += size_per_ubo;
702710
uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
703711
}
704712
return vkapi::BufferBindInfo(
705-
uniforms_.buffer(), logical_limits_uniform_offset_);
713+
uniforms_.buffer(), logical_limits_uniform_offset_, size_per_ubo);
706714
}
707715

708716
const vkapi::BufferBindInfo vTensor::numel_ubo() {
717+
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
718+
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
709719
if (!uniforms_.buffer()) {
710-
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
720+
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
711721
}
712722
if (numel_uniform_offset_ == kUniformOffsetUnset) {
713723
VK_CHECK_COND(
714-
(uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize,
724+
(uniforms_size_ + size_per_ubo) <= max_ubo_size,
715725
"Uniform data allocation has exceeded Tensor uniform buffer size");
716726
numel_uniform_offset_ = uniforms_size_;
717-
uniforms_size_ += kSizePerUniform;
727+
uniforms_size_ += size_per_ubo;
718728
uniforms_.update(numel(), numel_uniform_offset_);
719729
}
720-
return vkapi::BufferBindInfo(uniforms_.buffer(), numel_uniform_offset_);
730+
return vkapi::BufferBindInfo(
731+
uniforms_.buffer(), numel_uniform_offset_, size_per_ubo);
721732
}
722733

723734
size_t vTensor::staging_buffer_numel() const {

backends/vulkan/runtime/api/containers/Tensor.h

+7-10
Original file line numberDiff line numberDiff line change
@@ -348,16 +348,13 @@ class vTensor final {
348348
uint32_t numel_uniform_offset_;
349349
uint32_t logical_limits_uniform_offset_;
350350

351-
// Size allocated for each uniform
352-
// each uniform is assumed to be a vec of 4 ints to maintain 16 byte alignemnt
353-
constexpr static size_t kSizePerUniform = sizeof(utils::ivec4);
354-
// Total size of tensor's uniform buffer
355-
constexpr static size_t kMaxUniformBufferSize =
356-
4 * // we have 4 uniforms that are passed on to shaders
357-
kSizePerUniform;
358-
359-
// Initial value of uniform buffer offsets
360-
constexpr static uint32_t kUniformOffsetUnset = kMaxUniformBufferSize;
351+
// Maximum number of metadata fields that can be stored in the metadata UBO.
352+
// This is used to calculate the size of the UBO that should be allocated.
353+
constexpr static size_t kMaxMetadataFieldCount = 4;
354+
355+
// Initial value of uniform buffer offsets. 1 is selected as it is essentially
356+
// impossible for a ubo to have an offset of 1.
357+
constexpr static uint32_t kUniformOffsetUnset = 1;
361358

362359
vTensorStorage storage_;
363360

backends/vulkan/runtime/graph/ops/DispatchNode.cpp

+12-11
Original file line numberDiff line numberDiff line change
@@ -60,30 +60,31 @@ void DispatchNode::encode(ComputeGraph* graph) {
6060

6161
std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
6262

63+
std::array<uint8_t, kMaxPushConstantSize> push_constants_data;
64+
uint32_t push_constants_offset = 0;
65+
66+
for (const auto& push_constant : push_constants_) {
67+
push_constants_offset += push_constant.write(
68+
push_constants_data.data(),
69+
push_constants_offset,
70+
kMaxPushConstantSize);
71+
}
72+
6373
context->report_shader_dispatch_start(
6474
shader_.kernel_name,
6575
global_workgroup_size_,
6676
local_workgroup_size_,
6777
node_id_);
6878

69-
vkapi::DescriptorSet descriptor_set =
70-
context->get_descriptor_set(shader_, local_workgroup_size_, spec_vars_);
79+
vkapi::DescriptorSet descriptor_set = context->get_descriptor_set(
80+
shader_, local_workgroup_size_, spec_vars_, push_constants_offset);
7181

7282
uint32_t idx = 0;
7383
idx = bind_values_to_descriptor_set(
7484
graph, args_, pipeline_barrier, descriptor_set, idx);
7585

7686
bind_params_to_descriptor_set(params_, descriptor_set, idx);
7787

78-
std::array<uint8_t, kMaxPushConstantSize> push_constants_data;
79-
uint32_t push_constants_offset = 0;
80-
81-
for (const auto& push_constant : push_constants_) {
82-
push_constants_offset += push_constant.write(
83-
push_constants_data.data(),
84-
push_constants_offset,
85-
kMaxPushConstantSize);
86-
}
8788
context->register_shader_dispatch(
8889
descriptor_set,
8990
pipeline_barrier,

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ void PrepackNode::encode(ComputeGraph* graph) {
7575

7676
{
7777
vkapi::PipelineBarrier pipeline_barrier{};
78-
vkapi::DescriptorSet descriptor_set =
79-
context->get_descriptor_set(shader_, local_workgroup_size_, spec_vars_);
78+
vkapi::DescriptorSet descriptor_set = context->get_descriptor_set(
79+
shader_, local_workgroup_size_, spec_vars_, 0u);
8080

8181
uint32_t idx = 0;
8282
bind_tensor_to_descriptor_set(

0 commit comments

Comments
 (0)