Skip to content

Commit a21a30a

Browse files
committed
Update on "[ET-VK] Using shared variable to store calculated output positions to free up registers and improve performance."
This diff introduces a shared variable to store the calculated output positions in the conv2d_pw op, freeing up registers and improving performance. The code changes include adding a shared variable to hold the calculated positions and modifying the existing code to use that shared variable. Differential Revision: [D67742567](https://our.internmc.facebook.com/intern/diff/D67742567/) [ghstack-poisoned]
2 parents 76c2011 + f7b60e3 commit a21a30a

File tree

26 files changed

+182
-113
lines changed

26 files changed

+182
-113
lines changed

.github/workflows/android-perf.yml

+4-2
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ jobs:
9898
- uses: actions/checkout@v3
9999

100100
- name: Prepare the spec
101+
id: prepare
101102
shell: bash
102103
env:
103104
BENCHMARK_CONFIG: ${{ toJSON(matrix) }}
@@ -111,7 +112,7 @@ jobs:
111112
# so let's just sed it
112113
sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' android-llm-device-farm-test-spec.yml.j2
113114
114-
BENCHMARK_CONFIG_ID="${{ matrix.model }}_${{ matrix.config }}"
115+
BENCHMARK_CONFIG_ID=$(echo "${{ matrix.model }}_${{ matrix.config }}" | sed -e 's/[^A-Za-z0-9._-]/_/g')
115116
# The config for this benchmark runs, we save it in the test spec so that it can be fetched
116117
# later by the upload script
117118
sed -i -e 's,{{ benchmark_config_id }},'"${BENCHMARK_CONFIG_ID}"',g' android-llm-device-farm-test-spec.yml.j2
@@ -122,6 +123,7 @@ jobs:
122123
123124
# Save the benchmark configs so that we can use it later in the dashboard
124125
echo "${BENCHMARK_CONFIG}" > "${BENCHMARK_CONFIG_ID}.json"
126+
echo "benchmark-config-id=${BENCHMARK_CONFIG_ID}" >> $GITHUB_OUTPUT
125127
126128
- name: Upload the spec
127129
uses: seemethere/upload-artifact-s3@v5
@@ -141,7 +143,7 @@ jobs:
141143
${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/
142144
retention-days: 1
143145
if-no-files-found: error
144-
path: extension/benchmark/android/benchmark/${{ matrix.model }}_${{ matrix.config }}.json
146+
path: extension/benchmark/android/benchmark/${{ steps.prepare.outputs.benchmark-config-id }}.json
145147

146148
export-models:
147149
name: export-models

.github/workflows/apple-perf.yml

+4-2
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ jobs:
100100
- uses: actions/checkout@v3
101101

102102
- name: Prepare the spec
103+
id: prepare
103104
shell: bash
104105
env:
105106
BENCHMARK_CONFIG: ${{ toJSON(matrix) }}
@@ -113,7 +114,7 @@ jobs:
113114
# so let's just sed it
114115
sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' default-ios-device-farm-appium-test-spec.yml.j2
115116
116-
BENCHMARK_CONFIG_ID="${{ matrix.model }}_${{ matrix.config }}"
117+
BENCHMARK_CONFIG_ID=$(echo "${{ matrix.model }}_${{ matrix.config }}" | sed -e 's/[^A-Za-z0-9._-]/_/g')
117118
# The config for this benchmark runs, we save it in the test spec so that it can be fetched
118119
# later by the upload script
119120
sed -i -e 's,{{ benchmark_config_id }},'"${BENCHMARK_CONFIG_ID}"',g' default-ios-device-farm-appium-test-spec.yml.j2
@@ -124,6 +125,7 @@ jobs:
124125
125126
# Save the benchmark configs so that we can use it later in the dashboard
126127
echo "${BENCHMARK_CONFIG}" > "${BENCHMARK_CONFIG_ID}.json"
128+
echo "benchmark-config-id=${BENCHMARK_CONFIG_ID}" >> $GITHUB_OUTPUT
127129
128130
- name: Upload the spec
129131
uses: seemethere/upload-artifact-s3@v5
@@ -143,7 +145,7 @@ jobs:
143145
${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/
144146
retention-days: 1
145147
if-no-files-found: error
146-
path: extension/benchmark/apple/Benchmark/${{ matrix.model }}_${{ matrix.config }}.json
148+
path: extension/benchmark/apple/Benchmark/${{ steps.prepare.outputs.benchmark-config-id }}.json
147149

148150
export-models:
149151
name: export-models

backends/cadence/aot/compiler.py

-16
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,11 @@
1818
)
1919
from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion
2020
from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer
21-
22-
from executorch.backends.cadence.aot.replace_ops import ReplaceSafeSoftmaxWithSoftmax
2321
from executorch.backends.cadence.aot.utils import (
2422
get_default_memory_config,
2523
MemoryConfig,
26-
model_gm_has_SDPA,
2724
model_is_quantized,
2825
)
29-
from executorch.backends.transforms.decompose_sdpa import (
30-
DecomposeScaledDotProductAttention,
31-
)
3226
from executorch.devtools import generate_etrecord
3327
from executorch.exir import (
3428
EdgeCompileConfig,
@@ -91,16 +85,6 @@ def convert_pt2(
9185
.module()
9286
)
9387

94-
if model_gm_has_SDPA(model_gm):
95-
# Decompose SDPA
96-
DecomposeScaledDotProductAttention(False)(model_gm)
97-
98-
# Swap _safe_softmax with _softmax (see https://github.com/pytorch/pytorch/pull/133882
99-
# for details).
100-
result = ReplaceSafeSoftmaxWithSoftmax()(model_gm)
101-
assert result is not None
102-
model_gm = result.graph_module
103-
10488
# Prepare
10589
prepared_model = prepare_pt2e(model_gm, quantizer)
10690

backends/cadence/aot/compiler_utils.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -129,16 +129,16 @@ def get_transposed_dims(node: torch.fx.Node, dims: List[int]) -> List[int]:
129129

130130

131131
# Capture the effect of permute op on incoming dimension order
132-
def get_permuted_dims(node: torch.fx.Node, dims: Optional[List[int]]) -> List[int]:
132+
def get_permuted_dims(node: torch.fx.Node, dims: Optional[Sequence[int]]) -> List[int]:
133133
"""
134134
Given a permute node, and the incoming dimension ordering of the input
135135
tensor to the permute node, return the net effect of permute op on the
136136
dimension order.
137137
"""
138138
assert node.target == exir_ops.edge.aten.permute_copy.default
139139
# Permute each index of the dimension ordering (dims)
140-
permute_dims = node.args[1]
141-
assert isinstance(permute_dims, List)
140+
# pyre-fixme[6]: This combined typecheck isn't supported yet.
141+
permute_dims: List[int] = list(node.args[1])
142142
assert all(isinstance(x, int) for x in permute_dims)
143143
# If the dims is empty, we can simply return the permute order
144144
if not dims:

backends/cadence/aot/reorder_ops.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -438,9 +438,9 @@ def postpone_dequantize_op(self, graph_module: torch.fx.GraphModule) -> bool:
438438
args=(user, *node.args[1:]),
439439
)
440440
dequant_node.meta = user.meta.copy()
441-
# Remove meta["debug_handle"] on new node. Reassign it at the
442-
# caller level by calling generate_missing_debug_handles
443-
dequant_node.meta.pop("debug_handle")
441+
# Remove meta["debug_handle"] on new node if it exists.
442+
# Reassign it at the caller level by calling generate_missing_debug_handles
443+
dequant_node.meta.pop("debug_handle", None)
444444
user.replace_all_uses_with(dequant_node)
445445
dequant_node.args = (user, *node.args[1:])
446446

backends/cadence/aot/utils.py

-8
Original file line numberDiff line numberDiff line change
@@ -235,14 +235,6 @@ def print_ops_info(
235235
)
236236

237237

238-
def model_gm_has_SDPA(model_gm: torch.fx.GraphModule) -> bool:
239-
for node in model_gm.graph.nodes:
240-
if node.op == "call_function":
241-
if node.target == torch.ops.aten.scaled_dot_product_attention.default:
242-
return True
243-
return False
244-
245-
246238
def save_pte_program(
247239
prog: ExecutorchProgramManager, model_name: str, output_dir: str = ""
248240
) -> None:

backends/vulkan/runtime/api/Context.cpp

+5-4
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,13 @@ void Context::report_shader_dispatch_end() {
9090
vkapi::DescriptorSet Context::get_descriptor_set(
9191
const vkapi::ShaderInfo& shader_descriptor,
9292
const utils::uvec3& local_workgroup_size,
93-
const vkapi::SpecVarList& additional_constants) {
93+
const vkapi::SpecVarList& additional_constants,
94+
const uint32_t push_constants_size) {
9495
VkDescriptorSetLayout shader_layout =
9596
shader_layout_cache().retrieve(shader_descriptor.kernel_layout);
9697

9798
VkPipelineLayout pipeline_layout =
98-
pipeline_layout_cache().retrieve(shader_layout);
99+
pipeline_layout_cache().retrieve(shader_layout, push_constants_size);
99100

100101
vkapi::SpecVarList spec_constants = {
101102
SV(local_workgroup_size[0u]),
@@ -105,7 +106,7 @@ vkapi::DescriptorSet Context::get_descriptor_set(
105106
spec_constants.append(additional_constants);
106107

107108
VkPipeline pipeline = pipeline_cache().retrieve(
108-
{pipeline_layout_cache().retrieve(shader_layout),
109+
{pipeline_layout_cache().retrieve(shader_layout, push_constants_size),
109110
shader_cache().retrieve(shader_descriptor),
110111
spec_constants});
111112

@@ -151,7 +152,7 @@ void Context::register_shader_dispatch(
151152
const VkDescriptorSetLayout shader_layout =
152153
shader_layout_cache().retrieve(shader_descriptor.kernel_layout);
153154
const VkPipelineLayout pipeline_layout =
154-
pipeline_layout_cache().retrieve(shader_layout);
155+
pipeline_layout_cache().retrieve(shader_layout, push_constants_size);
155156
cmd_.set_push_constants(
156157
pipeline_layout, push_constants_data, push_constants_size);
157158
}

backends/vulkan/runtime/api/Context.h

+6-3
Original file line numberDiff line numberDiff line change
@@ -188,12 +188,13 @@ class Context final {
188188
vkapi::DescriptorSet get_descriptor_set(
189189
const vkapi::ShaderInfo&,
190190
const utils::uvec3&,
191-
const vkapi::SpecVarList&);
191+
const vkapi::SpecVarList&,
192+
const uint32_t push_constants_size);
192193

193194
inline vkapi::DescriptorSet get_descriptor_set(
194195
const vkapi::ShaderInfo& shader_descriptor,
195196
const utils::uvec3& local_work_group_size) {
196-
return get_descriptor_set(shader_descriptor, local_work_group_size, {});
197+
return get_descriptor_set(shader_descriptor, local_work_group_size, {}, 0u);
197198
}
198199

199200
void register_shader_dispatch(
@@ -333,8 +334,10 @@ inline bool Context::submit_compute_job(
333334
dispatch_id);
334335

335336
// Factor out template parameter independent code to minimize code bloat.
337+
// Note that push constants are not exposed yet via this API, therefore the
338+
// push constants size is assumed to be 0.
336339
vkapi::DescriptorSet descriptor_set = get_descriptor_set(
337-
shader, local_work_group_size, specialization_constants);
340+
shader, local_work_group_size, specialization_constants, 0u);
338341

339342
detail::bind(
340343
descriptor_set,

backends/vulkan/runtime/api/containers/ParamsBuffer.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,9 @@ class ParamsBuffer final {
3131
vulkan_buffer_(
3232
context_p_->adapter_ptr()->vma().create_params_buffer(block)) {}
3333

34-
template <typename Block>
35-
ParamsBuffer(Context* context_p, const VkDeviceSize nbytes)
34+
// The last bool argument, though unused, is required to disambiguate this
35+
// constructor from the one above.
36+
ParamsBuffer(Context* context_p, const VkDeviceSize nbytes, const bool unused)
3637
: context_p_(context_p),
3738
vulkan_buffer_(
3839
context_p_->adapter_ptr()->vma().create_uniform_buffer(nbytes)) {}

backends/vulkan/runtime/api/containers/Tensor.cpp

+27-16
Original file line numberDiff line numberDiff line change
@@ -658,66 +658,77 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
658658
}
659659

660660
const vkapi::BufferBindInfo vTensor::sizes_ubo() {
661+
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
662+
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
661663
if (!uniforms_.buffer()) {
662-
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
664+
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
663665
}
664666
if (sizes_uniform_offset_ == kUniformOffsetUnset) {
665667
VK_CHECK_COND(
666-
(uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize,
668+
(uniforms_size_ + size_per_ubo) <= max_ubo_size,
667669
"Uniform data allocation has exceeded Tensor uniform buffer size");
668670
sizes_uniform_offset_ = uniforms_size_;
669-
uniforms_size_ += kSizePerUniform;
671+
uniforms_size_ += size_per_ubo;
670672
uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
671673
}
672-
return vkapi::BufferBindInfo(uniforms_.buffer(), sizes_uniform_offset_);
674+
return vkapi::BufferBindInfo(
675+
uniforms_.buffer(), sizes_uniform_offset_, size_per_ubo);
673676
}
674677

675678
const vkapi::BufferBindInfo vTensor::strides_ubo() {
679+
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
680+
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
676681
if (!uniforms_.buffer()) {
677-
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
682+
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
678683
}
679684
if (unsqueezed_strides_offset_ == kUniformOffsetUnset) {
680685
VK_CHECK_COND(
681-
(uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize,
686+
(uniforms_size_ + size_per_ubo) <= max_ubo_size,
682687
"Uniform data allocation has exceeded Tensor uniform buffer size");
683688
unsqueezed_strides_offset_ = uniforms_size_;
684-
uniforms_size_ += kSizePerUniform;
689+
uniforms_size_ += size_per_ubo;
685690
uniforms_.update(
686691
utils::make_whcn_ivec4(unsqueezed_strides_),
687692
unsqueezed_strides_offset_);
688693
}
689-
return vkapi::BufferBindInfo(uniforms_.buffer(), unsqueezed_strides_offset_);
694+
return vkapi::BufferBindInfo(
695+
uniforms_.buffer(), unsqueezed_strides_offset_, size_per_ubo);
690696
}
691697

692698
const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
699+
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
700+
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
693701
if (!uniforms_.buffer()) {
694-
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
702+
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
695703
}
696704
if (logical_limits_uniform_offset_ == kUniformOffsetUnset) {
697705
VK_CHECK_COND(
698-
(uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize,
706+
(uniforms_size_ + size_per_ubo) <= max_ubo_size,
699707
"Uniform data allocation has exceeded Tensor uniform buffer size");
700708
logical_limits_uniform_offset_ = uniforms_size_;
701-
uniforms_size_ += kSizePerUniform;
709+
uniforms_size_ += size_per_ubo;
702710
uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
703711
}
704712
return vkapi::BufferBindInfo(
705-
uniforms_.buffer(), logical_limits_uniform_offset_);
713+
uniforms_.buffer(), logical_limits_uniform_offset_, size_per_ubo);
706714
}
707715

708716
const vkapi::BufferBindInfo vTensor::numel_ubo() {
717+
const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment();
718+
const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo;
709719
if (!uniforms_.buffer()) {
710-
uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize);
720+
uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true);
711721
}
712722
if (numel_uniform_offset_ == kUniformOffsetUnset) {
713723
VK_CHECK_COND(
714-
(uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize,
724+
(uniforms_size_ + size_per_ubo) <= max_ubo_size,
715725
"Uniform data allocation has exceeded Tensor uniform buffer size");
716726
numel_uniform_offset_ = uniforms_size_;
717-
uniforms_size_ += kSizePerUniform;
727+
uniforms_size_ += size_per_ubo;
718728
uniforms_.update(numel(), numel_uniform_offset_);
719729
}
720-
return vkapi::BufferBindInfo(uniforms_.buffer(), numel_uniform_offset_);
730+
return vkapi::BufferBindInfo(
731+
uniforms_.buffer(), numel_uniform_offset_, size_per_ubo);
721732
}
722733

723734
size_t vTensor::staging_buffer_numel() const {

backends/vulkan/runtime/api/containers/Tensor.h

+7-10
Original file line numberDiff line numberDiff line change
@@ -348,16 +348,13 @@ class vTensor final {
348348
uint32_t numel_uniform_offset_;
349349
uint32_t logical_limits_uniform_offset_;
350350

351-
// Size allocated for each uniform
352-
// each uniform is assumed to be a vec of 4 ints to maintain 16 byte alignemnt
353-
constexpr static size_t kSizePerUniform = sizeof(utils::ivec4);
354-
// Total size of tensor's uniform buffer
355-
constexpr static size_t kMaxUniformBufferSize =
356-
4 * // we have 4 uniforms that are passed on to shaders
357-
kSizePerUniform;
358-
359-
// Initial value of uniform buffer offsets
360-
constexpr static uint32_t kUniformOffsetUnset = kMaxUniformBufferSize;
351+
// Maximum number of metadata fields that can be stored in the metadata UBO.
352+
// This is used to calculate the size of the UBO that should be allocated.
353+
constexpr static size_t kMaxMetadataFieldCount = 4;
354+
355+
// Initial value of uniform buffer offsets. 1 is selected as it is essentially
356+
// impossible for a ubo to have an offset of 1.
357+
constexpr static uint32_t kUniformOffsetUnset = 1;
361358

362359
vTensorStorage storage_;
363360

backends/vulkan/runtime/graph/ops/DispatchNode.cpp

+12-11
Original file line numberDiff line numberDiff line change
@@ -60,30 +60,31 @@ void DispatchNode::encode(ComputeGraph* graph) {
6060

6161
std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
6262

63+
std::array<uint8_t, kMaxPushConstantSize> push_constants_data;
64+
uint32_t push_constants_offset = 0;
65+
66+
for (const auto& push_constant : push_constants_) {
67+
push_constants_offset += push_constant.write(
68+
push_constants_data.data(),
69+
push_constants_offset,
70+
kMaxPushConstantSize);
71+
}
72+
6373
context->report_shader_dispatch_start(
6474
shader_.kernel_name,
6575
global_workgroup_size_,
6676
local_workgroup_size_,
6777
node_id_);
6878

69-
vkapi::DescriptorSet descriptor_set =
70-
context->get_descriptor_set(shader_, local_workgroup_size_, spec_vars_);
79+
vkapi::DescriptorSet descriptor_set = context->get_descriptor_set(
80+
shader_, local_workgroup_size_, spec_vars_, push_constants_offset);
7181

7282
uint32_t idx = 0;
7383
idx = bind_values_to_descriptor_set(
7484
graph, args_, pipeline_barrier, descriptor_set, idx);
7585

7686
bind_params_to_descriptor_set(params_, descriptor_set, idx);
7787

78-
std::array<uint8_t, kMaxPushConstantSize> push_constants_data;
79-
uint32_t push_constants_offset = 0;
80-
81-
for (const auto& push_constant : push_constants_) {
82-
push_constants_offset += push_constant.write(
83-
push_constants_data.data(),
84-
push_constants_offset,
85-
kMaxPushConstantSize);
86-
}
8788
context->register_shader_dispatch(
8889
descriptor_set,
8990
pipeline_barrier,

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ void PrepackNode::encode(ComputeGraph* graph) {
7575

7676
{
7777
vkapi::PipelineBarrier pipeline_barrier{};
78-
vkapi::DescriptorSet descriptor_set =
79-
context->get_descriptor_set(shader_, local_workgroup_size_, spec_vars_);
78+
vkapi::DescriptorSet descriptor_set = context->get_descriptor_set(
79+
shader_, local_workgroup_size_, spec_vars_, 0u);
8080

8181
uint32_t idx = 0;
8282
bind_tensor_to_descriptor_set(

0 commit comments

Comments
 (0)