diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 5aa805dc1b3..26f461c062f 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -527,8 +527,6 @@ def register_view_op(features: OpFeatures):
         exir_ops.edge.aten.flip.default,
         exir_ops.edge.aten.index_select.default,
         exir_ops.edge.aten.select_copy.int,
-        # Tensor combination
-        exir_ops.edge.aten.repeat.default,
         # Tensor creation
         exir_ops.edge.aten.arange.start_step,
         exir_ops.edge.aten.clone.default,
@@ -561,6 +559,7 @@ def register_ported_op(features: OpFeatures):
         exir_ops.edge.aten.permute_copy.default,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
+        exir_ops.edge.aten.repeat.default,
         exir_ops.edge.aten.split_with_sizes_copy.default,
         exir_ops.edge.aten.split.Tensor,
     ]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
index 386be380072..c1b75ea8d0d 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
@@ -45,7 +45,7 @@ const lowp int packed_dim = unhash_packed_dim(out_layout);
 ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
 const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

-${layout_declare_spec_const(C, "bool", "repeat", "false")}
+${layout_declare_spec_const(C, "int", "repeat", "0")}

 void no_repeat_copy(ivec3 pos) {
   // Position in input tensor
@@ -229,7 +229,7 @@ void main() {
     return;
   }

-  if (repeat) {
+  if (repeat == 1) {
     repeat_copy(pos);
   } else {
     no_repeat_copy(pos);
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
index d006dee74a7..ecc2faa392a 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
@@ -151,7 +151,7 @@ void add_copy_packed_dim_offset_node(
       // Parameter buffers
       {},
       // Specialization Constants
-      {graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat},
+      {graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat ? 1 : 0},
       nullptr,
       {},
       {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
index 3f4ed4f1090..38221e8a348 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
@@ -23,8 +23,7 @@ void check_args(
     const api::vTensor& in,
     const std::vector<int64_t>& repeats,
     const api::vTensor& out) {
-  VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim));
-  VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim));
+  VK_CHECK_COND(check_same_packed_dim(in, out));

   VK_CHECK_COND(in.storage_type() == out.storage_type());
   if (in.storage_type() == utils::kTexture2D) {
@@ -59,147 +58,29 @@

 } // namespace

-void add_repeat_channel_node(
-    ComputeGraph& graph,
-    ValueRef in,
-    int64_t repeat_channel,
-    ValueRef out,
-    utils::ivec3& running_range) {
-  vTensorPtr t_in = graph.get_tensor(in);
-  vTensorPtr t_out = graph.get_tensor(out);
-
-  std::string kernel_name = "repeat_channel";
-  kernel_name.reserve(kShaderNameReserve);
-  add_dtype_suffix(kernel_name, *t_out);
-
-  const std::vector<int64_t>& in_sizes = t_in->sizes();
-
-  int32_t in_width = utils::safe_downcast<int32_t>(dim_at<kWidth4D>(in_sizes));
-  int32_t in_height =
-      utils::safe_downcast<int32_t>(dim_at<kHeight4D>(in_sizes));
-  int32_t in_channel =
-      utils::safe_downcast<int32_t>(dim_at<kChannel4D>(in_sizes));
-  int32_t in_batch = utils::safe_downcast<int32_t>(dim_at<kBatch4D>(in_sizes));
-
-  int32_t out_channel = repeat_channel * in_channel;
-
-  utils::ivec4 out_whcn_sizes{in_width, in_height, out_channel, in_batch};
-
-  utils::ivec4 in_whcn_sizes{in_width, in_height, in_channel, in_batch};
-
-  // Channel packed global work ids
-  running_range[2] = out_whcn_sizes[3] * utils::div_up_4(out_whcn_sizes[2]);
-  utils::uvec3 global_size = utils::make_uvec3(running_range);
-  utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
-  const struct Block final {
-    utils::ivec4 out_sizes;
-    utils::ivec4 in_size;
-  } repeat_channel_args{
-      out_whcn_sizes,
-      in_whcn_sizes,
-  };
-
-  auto shader = VK_KERNEL_FROM_STR(kernel_name);
-
-  graph.execute_nodes().emplace_back(new DispatchNode(
-      graph,
-      VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
-      // Inputs and Outputs
-      {{out, vkapi::MemoryAccessType::WRITE},
-       {in, vkapi::MemoryAccessType::READ}},
-      // Parameter buffers
-      {graph.create_params_buffer(repeat_channel_args)},
-      // Specialization Constants
-      {SV(t_out->packed_dim())}));
-}
-
 void add_repeat_node(
     ComputeGraph& graph,
     ValueRef in,
     ValueRef repeats_ref,
     ValueRef out) {
-  std::vector<int64_t> repeats = *(graph.get_int_list(repeats_ref));
+  const std::vector<int64_t> repeats = *(graph.get_int_list(repeats_ref));

   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
   check_args(*t_in, repeats, *t_out);

-  // In this function, we expand the dimensions in the following order:
-  // 1. Channel
-  // 2. Width
-  // 3. Height
-  // 4. Batch
-  // After expanding a dimension, we will update the "running_range" since we
-  // will need to copy the "expanded" area.
-
-  utils::ivec3 running_range = t_in->logical_limits();
-
-  const std::vector<int64_t>& in_sizes = t_in->sizes();
-
-  // Since we use channel packing, repeating the channel dimension is the most
-  // complicated and time-consuming, as we need to reason over misaligned
-  // channels. Hence we expand it first to minimize cost. Also, in this first
-  // dimension, we copy over the input texure to the output. In subsequent
-  // dimensions, we read and write from the same tensor.
-
-  if (int64_t channel_repeat = dim_at<kChannel4D>(repeats);
-      channel_repeat == 1) {
-    // If no repeat, short-cut to a direct copy
-    utils::ivec4 src_offset{0, 0, 0, 0};
-    utils::ivec4 dst_offset{0, 0, 0, 0};
-
-    add_copy_offset_node(
-        graph, in, running_range, src_offset, dst_offset, out, false, false);
-
-  } else {
-    add_repeat_channel_node(graph, in, channel_repeat, out, running_range);
-  }
-
-  // TODO: refactor width, height, and batch into a common helper function.
-  // Width
-  if (int64_t width_repeat = dim_at<kWidth4D>(repeats); width_repeat > 1) {
-    utils::ivec4 src_offset{0, 0, 0, 0};
-
-    for (int i = 1; i < width_repeat; ++i) {
-      utils::ivec4 dst_offset{i * dim_at<kWidth4D>(in_sizes), 0, 0, 0};
-
-      add_copy_offset_node(
-          graph, out, running_range, src_offset, dst_offset, out, true, false);
-    }
-
-    running_range[0] = running_range[0] * width_repeat;
-  }
-
-  // Height
-  if (int64_t height_repeat = dim_at<kHeight4D>(repeats); height_repeat > 1) {
-    utils::ivec4 src_offset{0, 0, 0, 0};
-
-    for (int i = 1; i < height_repeat; ++i) {
-      utils::ivec4 dst_offset = {0, i * dim_at<kHeight4D>(in_sizes), 0, 0};
-
-      add_copy_offset_node(
-          graph, out, running_range, src_offset, dst_offset, out, true, false);
-    }
-
-    running_range[1] = running_range[1] * height_repeat;
-  }
-
-  // Batch
-  if (int64_t batch_repeat = dim_at<kBatch4D>(repeats); batch_repeat > 1) {
-    utils::ivec4 src_offset{0, 0, 0, 0};
-
-    for (int i = 1; i < batch_repeat; ++i) {
-      utils::ivec4 dst_offset = {0, 0, i * running_range[2], 0};
-
-      add_copy_offset_node(
-          graph, out, running_range, src_offset, dst_offset, out, true, false);
-    }
-
-    running_range[2] = running_range[2] * batch_repeat;
-  }
+  const utils::ivec4 src_offset{
+      dim_at<kWidth4D>(t_in->sizes()),
+      dim_at<kHeight4D>(t_in->sizes()),
+      dim_at<kChannel4D>(t_in->sizes()),
+      dim_at<kBatch4D>(t_in->sizes())};
+  const utils::ivec4 dst_offset{
+      dim_at<kWidth4D>(repeats),
+      dim_at<kHeight4D>(repeats),
+      dim_at<kChannel4D>(repeats),
+      dim_at<kBatch4D>(repeats)};
+  add_copy_packed_dim_offset_node(
+      graph, in, t_out->logical_limits(), src_offset, dst_offset, out, true);
 }

 void repeat(ComputeGraph& graph, const std::vector<ValueRef>& args) {
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 329d62c2285..d2e09404ca0 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -756,7 +756,11 @@ def get_repeat_inputs():
             ((2, 3), [3, 1, 4]),
         ]
     )
-    test_suite_2d.layouts = ["utils::kChannelsPacked"]
+    test_suite_2d.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
+        "utils::kChannelsPacked",
+    ]
     test_suite_2d.storage_types = ["utils::kTexture2D"]
     test_suite_2d.data_gen = "make_seq_tensor"
     test_suite_2d.dtypes = ["at::kFloat"]
@@ -797,7 +801,11 @@ def get_repeat_inputs():
             ((2, 3), [3, 3, 2, 4]),
         ]
     )
-    test_suite_3d.layouts = ["utils::kChannelsPacked"]
+    test_suite_3d.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
+        "utils::kChannelsPacked",
+    ]
     test_suite_3d.storage_types = ["utils::kTexture3D"]
     test_suite_3d.data_gen = "make_seq_tensor"
     test_suite_3d.dtypes = ["at::kFloat"]
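For context on the semantics the consolidated copy path has to reproduce: `aten.repeat` tiles the input along each dimension by the corresponding multiplier, and extra leading entries in `repeats` add new leading dimensions. The snippet below is a minimal eager-mode reference sketch of that behavior (the shapes and repeat values are illustrative, not taken from `cases.py`), which can be handy when cross-checking Vulkan output against eager mode:

```python
import torch

# Reference semantics that the Vulkan repeat implementation must match.
# Shapes and repeat values here are illustrative examples only.
x = torch.arange(6, dtype=torch.float32).reshape(2, 3)

# Tile by 3 along dim 0 and 4 along dim 1: (2, 3) -> (6, 12).
y = x.repeat(3, 4)
assert y.shape == (6, 12)

# When len(repeats) exceeds x.dim(), leading entries create new dims:
# (2, 3) with repeats [3, 1, 4] -> (3, 2, 12).
z = x.repeat(3, 1, 4)
assert z.shape == (3, 2, 12)

# Each output element maps back to the input index modulo the input size
# in that dimension, which is what a repeat-aware copy shader effectively
# computes per output position.
for i in range(z.shape[1]):
    for j in range(z.shape[2]):
        assert z[0, i, j] == x[i % 2, j % 3]
```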