diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index a073919c696..5bfd6f78dcd 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -195,6 +196,68 @@ class GraphBuilder { } }; +// +// Execution tools +// + +bool maybe_resize_input( + ComputeGraph* graph, + const size_t input_i, + exec_aten::Tensor& et_tensor) { + ValueRef in_tensor_ref = graph->inputs()[input_i].value; + vTensor& in_tensor = graph->get_val(in_tensor_ref).toTensor(); + + ET_CHECK_MSG( + et_tensor.dim() == in_tensor.sizes().size(), + "Cannot resize input tensor: old ndim %zu does not match new ndim %zu", + static_cast(in_tensor.sizes().size()), + static_cast(et_tensor.dim())); + + bool should_resize = false; + std::vector new_sizes(et_tensor.dim()); + for (size_t i = 0; i < et_tensor.dim(); i++) { + if (in_tensor.sizes()[i] != et_tensor.sizes()[i]) { + should_resize = true; + } + new_sizes.at(i) = et_tensor.sizes()[i]; + } + + if (should_resize) { + graph->resize_input(input_i, new_sizes); + } + + ET_CHECK_MSG( + in_tensor.numel() == et_tensor.numel(), + "Vulkan tensor numel %zu does not match ET tensor numel %zu", + static_cast(in_tensor.numel()), + static_cast(et_tensor.numel())); + + return should_resize; +} + +void maybe_resize_output( + ComputeGraph* graph, + const size_t output_i, + exec_aten::Tensor& et_tensor) { + ValueRef out_tensor_ref = graph->outputs()[output_i].value; + vTensor& out_tensor = graph->get_val(out_tensor_ref).toTensor(); + + exec_aten::SizesType new_output_size[kTensorDimensionLimit]; + size_t ndim = out_tensor.sizes().size(); + for (int i = 0; i < ndim; ++i) { + new_output_size[i] = out_tensor.sizes()[i]; + } + + exec_aten::ArrayRef output_size{new_output_size, ndim}; + Error err = resize_tensor(et_tensor, output_size); + + ET_CHECK_MSG(err == Error::Ok, "Failed to resize output tensor."); +} + +// +// VulkanBackend class +// + class VulkanBackend final : public PyTorchBackendInterface { public: ~VulkanBackend() override = default; @@ -273,20 +336,28 @@ class VulkanBackend final : public PyTorchBackendInterface { ComputeGraph* compute_graph = static_cast(handle); const size_t num_inputs = compute_graph->inputs().size(); + bool should_propagate_resize = false; for (size_t i = 0; i < num_inputs; i++) { + bool was_resized = + maybe_resize_input(compute_graph, i, args[i]->toTensor()); + should_propagate_resize = should_propagate_resize || was_resized; compute_graph->copy_into_staging( - compute_graph->inputs()[i], + compute_graph->inputs()[i].staging, args[i]->toTensor().const_data_ptr(), args[i]->toTensor().numel()); } + if (should_propagate_resize) { + compute_graph->propagate_resize(); + } compute_graph->execute(); for (size_t i = 0; i < compute_graph->outputs().size(); i++) { + maybe_resize_output(compute_graph, i, args[num_inputs + i]->toTensor()); // args holds inputs directly followed by outputs, so the i'th output // for compute_graph corresponds to the (i + num_inputs)'th arg compute_graph->copy_from_staging( - compute_graph->outputs()[i], + compute_graph->outputs()[i].staging, args[num_inputs + i]->toTensor().mutable_data_ptr(), args[num_inputs + i]->toTensor().numel()); } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 6583d4a5a3e..1cc2d161be8 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -135,10 +135,10 @@ ValueRef ComputeGraph::set_input_tensor( vTensor& tensor = get_val(idx).toTensor(); ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel()); add_staging_to_tensor_node(*this, staging_idx, idx); - inputs_.push_back(staging_idx); + inputs_.push_back({idx, staging_idx}); return staging_idx; } - inputs_.push_back(idx); + inputs_.push_back({idx, kDummyValueRef}); return idx; } @@ -149,10 +149,10 @@ ValueRef ComputeGraph::set_output_tensor( vTensor& tensor = get_val(idx).toTensor(); ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel()); add_tensor_to_staging_node(*this, idx, staging_idx); - outputs_.push_back(staging_idx); + outputs_.push_back({idx, staging_idx}); return staging_idx; } - outputs_.push_back(idx); + outputs_.push_back({idx, kDummyValueRef}); return idx; } @@ -241,6 +241,19 @@ void ComputeGraph::execute() const { fence.wait(); } +void ComputeGraph::resize_input( + const int64_t idx, + const std::vector& new_sizes) { + IOValueRef io_val = inputs_.at(idx); + get_val(io_val.value).toTensor().virtual_resize(new_sizes); +} + +void ComputeGraph::propagate_resize() { + for (std::unique_ptr& node : execute_nodes_) { + node->trigger_resize(this); + } +} + } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 1253111150d..47c45f574e7 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -68,8 +68,8 @@ class ComputeGraph final { std::vector> prepack_nodes_; std::vector> execute_nodes_; - std::vector inputs_; - std::vector outputs_; + std::vector inputs_; + std::vector outputs_; public: // @@ -80,11 +80,11 @@ class ComputeGraph final { return context_.get(); } - inline std::vector& inputs() { + inline std::vector& inputs() { return inputs_; } - inline std::vector& outputs() { + inline std::vector& outputs() { return outputs_; } @@ -201,6 +201,13 @@ class ComputeGraph final { void encode_execute(); void execute() const; + + // + // Dynamic Shape support + // + + void resize_input(const int64_t idx, const std::vector& new_sizes); + void propagate_resize(); }; template diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp index 496a94238b4..e9d5ab18b42 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -22,12 +22,16 @@ ExecuteNode::ExecuteNode( const api::utils::uvec3& global_workgroup_size, const api::utils::uvec3& local_workgroup_size, const std::vector& args, - const std::vector>& params) + const std::vector>& params, + const ResizeFunction& resize_fn, + const std::vector& resize_args) : shader_(shader), global_workgroup_size_(global_workgroup_size), local_workgroup_size_(local_workgroup_size), args_(args), - params_(params) { + params_(params), + resize_fn_(resize_fn), + resize_args_(resize_args) { graph.update_descriptor_counts(shader, /*execute = */ true); } diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index 5e3a1e003b8..9d9beeab651 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -47,18 +47,31 @@ class ExecuteNode final { friend class ComputeGraph; public: + using ResizeFunction = const std::function&, + const std::vector&)>; + ExecuteNode( ComputeGraph& graph, const api::ShaderInfo& shader, const api::utils::uvec3& global_workgroup_size, const api::utils::uvec3& local_workgroup_size, const std::vector& args, - const std::vector>& params); + const std::vector>& params, + const ResizeFunction& resize_fn = nullptr, + const std::vector& resize_args = {}); ~ExecuteNode() = default; void encode(ComputeGraph* graph); + inline void trigger_resize(ComputeGraph* graph) { + if (resize_fn_ != nullptr) { + resize_fn_(graph, args_, resize_args_); + } + } + protected: const api::ShaderInfo shader_; const api::utils::uvec3 global_workgroup_size_; @@ -66,6 +79,8 @@ class ExecuteNode final { const std::vector args_; // TODO(T180906457): allow re-computing param buffers. std::vector> params_; + const ResizeFunction resize_fn_; + const std::vector resize_args_; }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index 8aa1382f7e3..1d637ecb343 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -19,6 +19,28 @@ namespace at { namespace native { namespace vulkan { +void resize_binary_op_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + vTensor& out = graph->get_val(args[0].refs[0]).toTensor(); + vTensor& self = graph->get_val(args[1].refs[0]).toTensor(); + vTensor& other = graph->get_val(args[1].refs[1]).toTensor(); + + std::vector new_out_sizes( + std::max(self.sizes().size(), other.sizes().size())); + + // Match the sizes in reverse because sizes are in NCHW order + for (int i = -1; i >= -new_out_sizes.size(); --i) { + new_out_sizes.at(new_out_sizes.size() + i) = std::max( + api::utils::val_at(i, self.sizes()), + api::utils::val_at(i, other.sizes())); + } + + out.virtual_resize(new_out_sizes); +} + void add_binary_op_node( ComputeGraph& graph, const ValueRef in1, @@ -52,12 +74,16 @@ void add_binary_op_node( VK_KERNEL_FROM_STR(kernel_name.str()), global_size, local_size, + // Inputs and Outputs {{out, api::MemoryAccessType::WRITE}, {{arg1, arg2}, api::MemoryAccessType::READ}}, + // Shader params buffers {t_out.gpu_sizes_ubo(), t_in1.gpu_sizes_ubo(), t_in2.gpu_sizes_ubo(), - graph.create_params_buffer(alpha_val)})); + graph.create_params_buffer(alpha_val)}, + // Resizing + resize_binary_op_node)); } #define DEFINE_BINARY_OP_WITH_ALPHA_FN(op_name) \ diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index 572ef018bc2..4bd0c527605 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -218,13 +218,14 @@ def process_getattr_node(self, node: Node) -> None: self.create_tensor_values(node) def process_output_node(self, node: Node) -> None: - if node.all_input_nodes[0] not in self.node_to_value_ids: - raise AssertionError( - "Cannot find input to output node in node_to_value_ids. This means the " - "output node is being serialized before its corresponding internal node " - "which is not allowed." - ) - self.output_ids.append(self.node_to_value_ids[node.all_input_nodes[0]]) + for out_node in node.all_input_nodes: + if out_node not in self.node_to_value_ids: + raise AssertionError( + "Cannot find input to output node in node_to_value_ids. This means " + "the output node is being serialized before its corresponding " + "internal node which is not allowed." + ) + self.output_ids.append(self.node_to_value_ids[out_node]) def process_node(self, node: Node) -> None: if node.op == "placeholder": diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index 02f7351d065..1e7670d1ccf 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -146,6 +146,7 @@ def define_common_targets(is_fbcode = False): ":vk_delegate_schema", ":vulkan_graph_runtime", "//executorch/runtime/backend:interface", + "//executorch/runtime/core/exec_aten/util:tensor_util", ], define_static_target = False, # VulkanBackend.cpp needs to compile with executor as whole diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index 8a491497c31..7def98df711 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -14,7 +14,7 @@ from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend from executorch.exir import EdgeProgramManager, to_edge -from torch.export import export, ExportedProgram +from torch.export import Dim, export, ExportedProgram ctypes.CDLL("libvulkan.so.1") @@ -54,13 +54,17 @@ def lower_module_and_test_output( sample_inputs: Tuple[torch.Tensor], atol=1e-03, rtol=1e-01, + dynamic_shapes=None, + test_inputs=None, ): """ Helper testing function that takes a torch.nn.Module and lowers it to Vulkan with the given sample inputs. It then runs the lowered module and compares its outputs with the outputs of the eager module. """ - program: ExportedProgram = export(model, sample_inputs) + program: ExportedProgram = export( + model, sample_inputs, dynamic_shapes=dynamic_shapes + ) edge_program: EdgeProgramManager = to_edge(program) edge_program = edge_program.to_backend(VulkanPartitioner()) @@ -80,6 +84,19 @@ def lower_module_and_test_output( self.assert_outputs_equal(model_output, ref_output, atol=atol, rtol=rtol) + if test_inputs is not None: + for test_input in test_inputs: + # pyre-fixme[16]: Module `pytree` has no attribute `tree_flatten`. + test_inputs_flattened, _ = tree_flatten(test_input) + model_output = executorch_module.run_method( + "forward", tuple(test_inputs_flattened) + ) + ref_output = model(*test_input) + + self.assert_outputs_equal( + model_output, ref_output, atol=atol, rtol=rtol + ) + def test_vulkan_backend_add(self): # This test is the simplest test by manually lowering some submodules, we can use paritioner for auto detecting lowerable parts class AddModule(torch.nn.Module): @@ -93,12 +110,12 @@ def forward(self, x, y): return z add_module = AddModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(add_module, model_inputs) + self.lower_module_and_test_output(add_module, sample_inputs) def test_vulkan_backend_internal_data(self): class InternalDataModule(torch.nn.Module): @@ -114,12 +131,12 @@ def forward(self, x, y): return z internal_data_module = InternalDataModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(internal_data_module, model_inputs) + self.lower_module_and_test_output(internal_data_module, sample_inputs) def test_vulkan_backend_sub(self): class SubModule(torch.nn.Module): @@ -133,12 +150,12 @@ def forward(self, x, y): return z sub_module = SubModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(sub_module, model_inputs) + self.lower_module_and_test_output(sub_module, sample_inputs) def test_vulkan_backend_mul(self): class MulModule(torch.nn.Module): @@ -152,12 +169,12 @@ def forward(self, x, y): return z mul_module = MulModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(mul_module, model_inputs) + self.lower_module_and_test_output(mul_module, sample_inputs) def test_vulkan_backend_div(self): class DivModule(torch.nn.Module): @@ -171,12 +188,12 @@ def forward(self, x, y): return z div_module = DivModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(div_module, model_inputs) + self.lower_module_and_test_output(div_module, sample_inputs) def test_vulkan_backend_arithmetic(self): class ArithmeticModule(torch.nn.Module): @@ -192,12 +209,12 @@ def forward(self, x, y): return z arithmetic_module = ArithmeticModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(arithmetic_module, model_inputs) + self.lower_module_and_test_output(arithmetic_module, sample_inputs) def test_vulkan_backend_floor_div(self): class FloorDivModule(torch.nn.Module): @@ -209,14 +226,14 @@ def forward(self, x, y): return z floor_div_module = FloorDivModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32) * 10.0, torch.rand(size=(2, 3), dtype=torch.float32) + 1.0, ) # absolute tolerance is 1 because of flooring self.lower_module_and_test_output( - floor_div_module, model_inputs, atol=1.0 + 1e-03 + floor_div_module, sample_inputs, atol=1.0 + 1e-03 ) def test_vulkan_backend_pow(self): @@ -229,12 +246,12 @@ def forward(self, x, y): return z pow_module = PowModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(pow_module, model_inputs) + self.lower_module_and_test_output(pow_module, sample_inputs) def test_vulkan_backend_partial(self): class SimpleModel(torch.nn.Module): @@ -248,6 +265,41 @@ def forward(self, x): return self.linear(x + self.offset_1) - self.offset_2 model = SimpleModel() - model_inputs = (torch.rand(size=(2, 10), dtype=torch.float32),) + sample_inputs = (torch.rand(size=(2, 10), dtype=torch.float32),) - self.lower_module_and_test_output(model, model_inputs) + self.lower_module_and_test_output(model, sample_inputs) + + def test_vulkan_backend_partial_dynamic_shapes(self): + class SimpleModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.branch1 = torch.nn.Sequential( + torch.nn.Linear(64, 64), torch.nn.ReLU() + ) + self.branch2 = torch.nn.Sequential( + torch.nn.Linear(128, 64), torch.nn.ReLU() + ) + self.buffer_1 = torch.ones((1, 64)) * 0.5 + self.buffer_2 = torch.ones((1, 64)) * 1.4 + + def forward(self, x1, x2): + out1 = self.branch1(x1) + out2 = self.branch2(x2) + return (out1 + self.buffer_1 + out2) * self.buffer_2 + + model = SimpleModel() + sample_inputs = (torch.randn(32, 64), torch.randn(32, 128)) + batch = Dim("batch", max=32) + dynamic_shapes = {"x1": {0: batch}, "x2": {0: batch}} + + test_inputs = [ + (torch.randn(15, 64), torch.randn(15, 128)), + (torch.randn(6, 64), torch.randn(6, 128)), + (torch.randn(30, 64), torch.randn(30, 128)), + (torch.randn(20, 64), torch.randn(20, 128)), + (torch.randn(19, 64), torch.randn(19, 128)), + ] + + self.lower_module_and_test_output( + model, sample_inputs, dynamic_shapes=dynamic_shapes, test_inputs=test_inputs + ) diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index af6321d601b..f678c4c3818 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -545,7 +545,7 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { } } -TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_manual_resize) { +TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { GraphConfig config; ComputeGraph graph(config); @@ -643,4 +643,38 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_manual_resize) { EXPECT_TRUE(val == val_out); } } + + std::vector> new_sizes_list_2 = { + {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}}; + + for (auto& new_sizes : new_sizes_list_2) { + graph.resize_input(0, new_sizes); + graph.resize_input(1, new_sizes); + graph.resize_input(2, new_sizes); + graph.propagate_resize(); + + // Check output shape + EXPECT_TRUE(graph.get_val(out.value).toTensor().sizes() == new_sizes); + + float val_a = new_sizes[1] + 6.0f; + float val_b = new_sizes[2] + 2.5f; + float val_d = new_sizes[0] + 4.0f; + float val_out = (val_a + val_b) * val_d; + + fill_vtensor(graph, a, val_a); + fill_vtensor(graph, b, val_b); + fill_vtensor(graph, d, val_d); + + // Execute graph + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + int i = 0; + for (const auto& val : data_out) { + ASSERT_TRUE(val == val_out); + ++i; + } + } } diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 293d114e8d3..27f42d1ec8f 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -22,6 +22,8 @@ from executorch.exir.passes import MemoryPlanningPass, SpecPropPass +from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass + from executorch.exir.program._program import _copy_module from torch import dtype, float32 @@ -46,6 +48,7 @@ def preprocess( # noqa: C901 ) -> PreprocessResult: passes = [ SpecPropPass(), + ConstraintBasedSymShapeEvalPass(), MemoryPlanningPass("greedy"), ]