Skip to content

Commit c350e58

Browse files
SS-JIAfacebook-github-bot
authored andcommitted
Deprecate gpu_sizes_ubo() and extents(); also toggle packing layout via specialization constants (#3181)
Summary: Pull Request resolved: #3181 ## Context This changeset cleans up how shaders consume tensor metadata in two ways: ### Pass in Packing Layout via Specialization Shader The packing layout of a tensor determines how to convert between tensor indices and physical texture coordinates. Currently, the packing layout is determined by generating a completely new variant of a shader. However, this is rather expensive for build size. Specialization constants support was added a while back, which enables packing layout to be communicated to the shader via a specialization constant. This is a much better and natural way for shaders to determine the packing layout of its tensors and vary its behaviour. The primary benefit of this is that we can vastly reduce the number of variants that are generated. Generating shader variants for combinations of dtypes and memory layouts can lead to combinatorial explosion of build size. Note that dtype cannot be passed as a specialization constant since it impacts the types used in the layout portion of a shader. ### Deprecate GPU sizes and Extents Currently there are 3 representations of the tensor's sizes; `cpu_sizes()`, `gpu_sizes()`, and `extents()`. The GPU sizes is a simple modification of the CPU sizes where the packed dim is aligned to the next multiple of 4. Extents represents the physical extents of the image texture used to store the image. However, often times shaders need to reference the original sizes of the tensor so we end up passing two different representations of the tensor sizes. The CPU sizes and extents is used to determine out of bounds elements and the GPU sizes is used to convert between logical tensor indices and physical texture coordinates. Since the GPU sizes and extents are easily determined from the CPU sizes given the packing layout, deprecate GPU sizes and use CPU sizes exclusively as the canonical tensor sizes. Hence `cpu_sizes()` is renamed to simple `sizes()`. The primary benefit of this change is such: 1. Less confusion over how to reference the tensor sizes 2. Fewer descriptors to bind when constructing compute pipelines 3. Fewer uniform buffers to update when resizing tensors between inferences. ghstack-source-id: 223317313 Reviewed By: yipjustin Differential Revision: D56377775 fbshipit-source-id: 31235fbdf0b694e24b8c6fc0b40c56ddb818439d
1 parent d89eabb commit c350e58

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+892
-860
lines changed

backends/vulkan/runtime/api/Tensor.cpp

Lines changed: 16 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -139,10 +139,8 @@ vTensor::vTensor(
139139
// Calculate sizes and strides
140140
sizes_(sizes.begin(), sizes.end()),
141141
gpu_sizes_{calc_gpu_sizes(sizes, memory_layout_, storage_type)},
142-
// Utility Uniform Buffers that can be passed to shaders as arguments
143-
cpu_sizes_uniform_(),
144-
gpu_sizes_uniform_(),
145-
extents_uniform_(),
142+
// Utility Uniform Buffer that can be passed to shaders as arguments
143+
sizes_uniform_(context, api::utils::make_whcn_ivec4(sizes_)),
146144
// Construct Tensor storage
147145
storage_(
148146
context,
@@ -189,35 +187,6 @@ api::VulkanBuffer& vTensor::buffer(
189187
return storage_.buffer_;
190188
}
191189

192-
const api::BufferBindInfo vTensor::cpu_sizes_ubo() {
193-
if (!cpu_sizes_uniform_.buffer()) {
194-
cpu_sizes_uniform_ = api::UniformParamsBuffer(
195-
storage_.context_, api::utils::make_whcn_ivec4(sizes_));
196-
}
197-
return api::BufferBindInfo(cpu_sizes_uniform_.buffer());
198-
}
199-
200-
const api::BufferBindInfo vTensor::gpu_sizes_ubo() {
201-
if (!gpu_sizes_uniform_.buffer()) {
202-
gpu_sizes_uniform_ = api::UniformParamsBuffer(
203-
storage_.context_, api::utils::make_whcn_ivec4(gpu_sizes_));
204-
}
205-
return api::BufferBindInfo(gpu_sizes_uniform_.buffer());
206-
}
207-
208-
const api::BufferBindInfo vTensor::extents_ubo() {
209-
if (!extents_uniform_.buffer()) {
210-
extents_uniform_ = api::UniformParamsBuffer(
211-
storage_.context_,
212-
api::utils::uvec4(
213-
{storage_.extents_.data[0],
214-
storage_.extents_.data[1],
215-
storage_.extents_.data[2],
216-
1u}));
217-
}
218-
return api::BufferBindInfo(extents_uniform_.buffer());
219-
}
220-
221190
VmaAllocationCreateInfo vTensor::get_allocation_create_info() const {
222191
switch (storage_type()) {
223192
case api::kBuffer:
@@ -255,24 +224,7 @@ void vTensor::bind_allocation(const api::MemoryAllocation& allocation) {
255224
void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
256225
sizes_ = new_sizes;
257226
gpu_sizes_ = calc_gpu_sizes(sizes_, memory_layout_, storage_type());
258-
api::utils::uvec3 virtual_extents =
259-
create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
260-
261-
if (cpu_sizes_uniform_.buffer()) {
262-
cpu_sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
263-
}
264-
265-
if (gpu_sizes_uniform_.buffer()) {
266-
gpu_sizes_uniform_.update(api::utils::make_whcn_ivec4(gpu_sizes_));
267-
}
268-
269-
if (extents_uniform_.buffer()) {
270-
extents_uniform_.update(api::utils::uvec4(
271-
{virtual_extents.data[0],
272-
virtual_extents.data[1],
273-
virtual_extents.data[2],
274-
1u}));
275-
}
227+
sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
276228
}
277229

278230
void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
@@ -284,6 +236,19 @@ void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
284236
}
285237

286238
void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
239+
if (storage_type() != api::kBuffer) {
240+
api::utils::uvec3 virtual_extents =
241+
create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
242+
243+
bool valid_resize = virtual_extents.data[0] <= extents().data[0];
244+
valid_resize = valid_resize && virtual_extents.data[1] <= extents().data[1];
245+
valid_resize = valid_resize && virtual_extents.data[2] <= extents().data[2];
246+
247+
VK_CHECK_COND(
248+
valid_resize,
249+
"Cannot use virtual resize if new sizes requires a larger texture.");
250+
}
251+
287252
update_size_metadata(new_sizes);
288253
}
289254

backends/vulkan/runtime/api/Tensor.h

Lines changed: 6 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -118,17 +118,7 @@ class vTensor final {
118118

119119
// A Vulkan uniform buffer containing the tensor sizes in WHCN that can be
120120
// passed into a shader.
121-
api::UniformParamsBuffer cpu_sizes_uniform_;
122-
123-
// A Vulkan uniform buffer containing the GPU tensor sizes in WHCN that can
124-
// be passed into a shader. GPU sizes refers to the sizes of the tensor after
125-
// padding has been applied to one dimension to align it to the next multiple
126-
// of 4.
127-
api::UniformParamsBuffer gpu_sizes_uniform_;
128-
129-
// A Vulkan uniform buffer containing the image extents of the underlying
130-
// image texture that can be passed into a shader.
131-
api::UniformParamsBuffer extents_uniform_;
121+
api::UniformParamsBuffer sizes_uniform_;
132122

133123
vTensorStorage storage_;
134124

@@ -203,25 +193,12 @@ class vTensor final {
203193
}
204194

205195
/*
206-
* Get a uniform buffer object containing the tensor sizes to use in a compute
207-
* shader. Note that the UBO will be created the first time this function is
208-
* called.
209-
*/
210-
const api::BufferBindInfo cpu_sizes_ubo();
211-
212-
/*
213-
* Get a uniform buffer object containing the tensor GPU sizes to use in a
214-
* compute shader. Note that the UBO will be created the first time this
215-
* function is called.
196+
* Get the binding information for the uniform buffer object containing the
197+
* tensor sizes to use in a compute shader.
216198
*/
217-
const api::BufferBindInfo gpu_sizes_ubo();
218-
219-
/*
220-
* Get a uniform buffer object containing the image extents to use in a
221-
* compute shader. Note that the UBO will be created the first time this
222-
* function is called.
223-
*/
224-
const api::BufferBindInfo extents_ubo();
199+
inline const api::BufferBindInfo sizes_ubo() {
200+
return api::BufferBindInfo(sizes_uniform_.buffer());
201+
}
225202

226203
inline size_t numel() const {
227204
return api::utils::multiply_integers(sizes());

backends/vulkan/runtime/graph/ops/ExecuteNode.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,17 @@ ExecuteNode::ExecuteNode(
2121
const api::utils::uvec3& local_workgroup_size,
2222
const std::vector<ArgGroup>& args,
2323
const api::ParamsBindList& params,
24+
const api::SpecVarList& spec_vars,
2425
const ResizeFunction& resize_fn,
25-
const std::vector<ValueRef>& resize_args,
26-
const api::SpecVarList& spec_vars)
26+
const std::vector<ValueRef>& resize_args)
2727
: shader_(shader),
2828
global_workgroup_size_(global_workgroup_size),
2929
local_workgroup_size_(local_workgroup_size),
3030
args_(args),
3131
params_(params),
32+
spec_vars_(spec_vars),
3233
resize_fn_(resize_fn),
33-
resize_args_(resize_args),
34-
spec_vars_(spec_vars) {
34+
resize_args_(resize_args) {
3535
graph.update_descriptor_counts(shader, /*execute = */ true);
3636
}
3737

backends/vulkan/runtime/graph/ops/ExecuteNode.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,9 @@ class ExecuteNode final {
5555
const api::utils::uvec3& local_workgroup_size,
5656
const std::vector<ArgGroup>& args,
5757
const api::ParamsBindList& params,
58+
const api::SpecVarList& spec_vars = {},
5859
const ResizeFunction& resize_fn = nullptr,
59-
const std::vector<ValueRef>& resize_args = {},
60-
const api::SpecVarList& spec_vars = {});
60+
const std::vector<ValueRef>& resize_args = {});
6161

6262
~ExecuteNode() = default;
6363

@@ -75,9 +75,9 @@ class ExecuteNode final {
7575
const api::utils::uvec3 local_workgroup_size_;
7676
const std::vector<ArgGroup> args_;
7777
const api::ParamsBindList params_;
78+
const api::SpecVarList spec_vars_;
7879
const ResizeFunction resize_fn_;
7980
const std::vector<ValueRef> resize_args_;
80-
const api::SpecVarList spec_vars_;
8181
};
8282

8383
} // namespace vkcompute

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,16 @@ PrepackNode::PrepackNode(
3131
const api::utils::uvec3& local_workgroup_size,
3232
const ValueRef tref,
3333
const ValueRef packed,
34-
const api::ParamsBindList& params)
34+
const api::ParamsBindList& params,
35+
const api::SpecVarList& spec_vars)
3536
: shader_(shader),
3637
noop_shader_(get_noop_shader(graph, packed)),
3738
global_workgroup_size_(global_workgroup_size),
3839
local_workgroup_size_(local_workgroup_size),
3940
tref_(tref),
4041
packed_(packed),
41-
params_(params) {
42+
params_(params),
43+
spec_vars_(spec_vars) {
4244
graph.update_descriptor_counts(shader, /*execute = */ false);
4345
graph.update_descriptor_counts(noop_shader_, /*execute = */ false);
4446
}
@@ -75,7 +77,7 @@ void PrepackNode::encode(ComputeGraph* graph) {
7577
{
7678
api::PipelineBarrier pipeline_barrier{};
7779
api::DescriptorSet descriptor_set =
78-
context->get_descriptor_set(shader_, local_workgroup_size_);
80+
context->get_descriptor_set(shader_, local_workgroup_size_, spec_vars_);
7981

8082
uint32_t idx = 0;
8183
bind_tensor_to_descriptor_set(

backends/vulkan/runtime/graph/ops/PrepackNode.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ class PrepackNode final {
3333
const api::utils::uvec3& local_workgroup_size,
3434
const ValueRef tref,
3535
const ValueRef packed,
36-
const api::ParamsBindList& params);
36+
const api::ParamsBindList& params,
37+
const api::SpecVarList& spec_vars = {});
3738

3839
~PrepackNode() = default;
3940

@@ -47,6 +48,7 @@ class PrepackNode final {
4748
const ValueRef tref_;
4849
const ValueRef packed_;
4950
const api::ParamsBindList params_;
51+
const api::SpecVarList spec_vars_;
5052

5153
private:
5254
api::StorageBuffer create_staging_buffer(ComputeGraph* graph);

backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl

Lines changed: 21 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,6 @@
1212

1313
#define VEC4_T ${texel_type(DTYPE)}
1414

15-
#define to_tensor_idx to_tensor_idx_${PACKING}
16-
#define to_texture_pos to_texture_pos_${PACKING}
17-
1815
#define op(X, Y, A) ${OPERATOR}
1916

2017
#include "broadcasting_utils.h"
@@ -27,59 +24,56 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
2724
layout(set = 0, binding = 2) uniform PRECISION sampler3D image_other;
2825

2926
layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes {
30-
ivec4 data;
31-
}
32-
out_sizes;
27+
ivec4 out_sizes;
28+
};
3329

3430
layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
35-
ivec4 data;
36-
}
37-
in_sizes;
31+
ivec4 in_sizes;
32+
};
3833

3934
layout(set = 0, binding = 5) uniform PRECISION restrict OtherSizes {
40-
ivec4 data;
41-
}
42-
other_sizes;
35+
ivec4 other_sizes;
36+
};
4337

4438
layout(set = 0, binding = 6) uniform PRECISION restrict BroadcastParams {
45-
ivec2 data;
46-
}
47-
broadcast_params;
39+
ivec2 broadcast_params;
40+
};
4841

4942
layout(set = 0, binding = 7) uniform PRECISION restrict Alpha {
50-
float data;
51-
}
52-
alpha;
43+
float alpha;
44+
};
5345

5446
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
5547

48+
layout(constant_id = 3) const int packed_dim = C_DIM;
49+
5650
void main() {
5751
const ivec3 pos = ivec3(gl_GlobalInvocationID);
58-
const ivec4 idx = to_tensor_idx(pos, out_sizes.data);
52+
const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim);
5953

60-
if (any(greaterThanEqual(idx, out_sizes.data))) {
54+
if (any(greaterThanEqual(idx, out_sizes))) {
6155
return;
6256
}
6357

64-
ivec4 in_idx = broadcast_indices(idx, in_sizes.data);
58+
ivec4 in_idx = broadcast_indices(idx, in_sizes);
6559
VEC4_T in_texel = VEC4_T(texelFetch(
6660
image_in,
67-
to_texture_pos(in_idx, in_sizes.data),
61+
to_texture_pos(in_idx, in_sizes, packed_dim),
6862
0));
6963

70-
ivec4 other_idx = broadcast_indices(idx, other_sizes.data);
64+
ivec4 other_idx = broadcast_indices(idx, other_sizes);
7165
VEC4_T other_texel = VEC4_T(texelFetch(
7266
image_other,
73-
to_texture_pos(other_idx, other_sizes.data),
67+
to_texture_pos(other_idx, other_sizes, packed_dim),
7468
0));
7569

7670
// Check boolean broadcast flags; we use ivec2 instead of bvec2 for alignment.
77-
if (broadcast_params.data.x > 0) {
71+
if (broadcast_params.x > 0) {
7872
in_texel = in_texel.xxxx;
7973
}
80-
if (broadcast_params.data.y > 0) {
74+
if (broadcast_params.y > 0) {
8175
other_texel = other_texel.xxxx;
8276
}
8377

84-
imageStore(image_out, pos, VEC4_T(op(in_texel, other_texel, alpha.data)));
78+
imageStore(image_out, pos, VEC4_T(op(in_texel, other_texel, alpha)));
8579
}

backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,6 @@ binary_op:
1111
DTYPE: float
1212
PACKING: C_packed
1313
generate_variant_forall:
14-
PACKING:
15-
- VALUE: C_packed
16-
- VALUE: W_packed
17-
- VALUE: H_packed
1814
DTYPE:
1915
- VALUE: half
2016
- VALUE: float

0 commit comments

Comments
 (0)