
Modifying slice op to support all tensor packing. #9030

Merged: 1 commit, Mar 7, 2025
14 changes: 13 additions & 1 deletion backends/vulkan/op_registry.py
@@ -530,7 +530,6 @@ def register_view_op(features: OpFeatures):
exir_ops.edge.aten.flip.default,
exir_ops.edge.aten.index_select.default,
exir_ops.edge.aten.select_copy.int,
exir_ops.edge.aten.slice_copy.Tensor,
# Tensor combination
exir_ops.edge.aten.cat.default,
exir_ops.edge.aten.split_with_sizes_copy.default,
@@ -557,6 +556,19 @@ def register_ported_op(features: OpFeatures):
return features


@update_features(
[
# Indexing and lookup
exir_ops.edge.aten.slice_copy.Tensor,
]
)
def register_ported_op_all_packed_dims(features: OpFeatures):
features.texture_impl = TextureImplFeatures(
valid_packed_dims=all_packed_dims,
)
return features


# Ported ops that support their own prepacking.
@update_features(
[
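Note on the registration change above: moving `exir_ops.edge.aten.slice_copy.Tensor` out of the channels-packed-only registration and into one with `valid_packed_dims=all_packed_dims` advertises that the op now works on width-, height-, and channels-packed textures. As a rough mental model, here is a minimal sketch (not part of the PR; the extent formulas are illustrative assumptions about the backend's image layout) of how each packing folds one dim into 4-wide texels and stacks batches along the texture's z axis:

```python
import math

def image_extents(N, C, H, W, packed_dim):
    # Fold the packed dim into 4-wide texels; stack batches along z.
    if packed_dim == "width":
        return (math.ceil(W / 4), H, C * N)
    if packed_dim == "height":
        return (W, math.ceil(H / 4), C * N)
    return (W, H, math.ceil(C / 4) * N)  # channels-packed

# e.g. a (N=2, C=6, H=8, W=10) tensor:
for p in ("width", "height", "channels"):
    print(p, image_extents(2, 6, 8, 10, p))
# width (3, 8, 12), height (10, 2, 12), channels (10, 8, 4)
```

The shader changes below have to handle the z-axis layouts that fall out of these packings.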
@@ -27,8 +27,7 @@ layout(set = 0, binding = 3) uniform PRECISION restrict SliceArg {
int dim;
int offset;
int step;
// Used when dim=batch. Stride is the # of plances for each batch value.
int stride;
int image_in_channel_size;
}
slice_arg;

@@ -45,11 +44,24 @@ void main() {

ivec3 in_pos = pos;

int index = pos[slice_arg.dim] / slice_arg.stride;
int within_stride = pos[slice_arg.dim] % slice_arg.stride;

in_pos[slice_arg.dim] = slice_arg.offset * slice_arg.stride + index * slice_arg.step *
slice_arg.stride + within_stride;
// slice along batch axis
if (slice_arg.dim == 3) {
// index of the channel inside a batch
const int chanl_index = pos.z % slice_arg.image_in_channel_size;
// index of batch
const int batch_index = pos.z / slice_arg.image_in_channel_size;
in_pos.z = (slice_arg.offset + batch_index * slice_arg.step) * slice_arg.image_in_channel_size + chanl_index;
} else if (slice_arg.dim == C_DIM) {
// index of the channel inside a batch
const int chanl_index = pos.z % sizes.z;
// index of batch
const int batch_index = pos.z / sizes.z;
in_pos.z = slice_arg.offset + batch_index * slice_arg.image_in_channel_size + chanl_index * slice_arg.step;
} else if (slice_arg.dim == H_DIM) {
in_pos.y = slice_arg.offset + pos.y * slice_arg.step;
} else {
in_pos.x = slice_arg.offset + pos.x * slice_arg.step;
}

imageStore(image_out, pos, texelFetch(image_in, in_pos, 0));

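The shader above (presumably slice_batch_height_width.glsl, given the kernel name used in Slice.cpp below) now branches on the slice dim instead of assuming a channels-packed layout. Here is a minimal Python sketch (not part of the PR) that mirrors the two z-axis branches, assuming one batch occupies `image_in_channel_size` z-planes when slicing along batch, and that z holds N*C entries (with `sizes.z` being the output channel count) when slicing channels on a width- or height-packed tensor:

```python
def batch_slice_in_z(out_z, offset, step, image_in_channel_size):
    """dim == batch: map an output z-plane to the input z-plane it reads."""
    chanl_index = out_z % image_in_channel_size   # plane within one batch
    batch_index = out_z // image_in_channel_size  # which output batch
    return (offset + batch_index * step) * image_in_channel_size + chanl_index

def channel_slice_in_z(out_z, offset, step, out_channels, in_channels):
    """dim == channels on a width/height-packed tensor: z holds N*C entries."""
    chanl_index = out_z % out_channels
    batch_index = out_z // out_channels
    return offset + batch_index * in_channels + chanl_index * step

# Worked example: N=4, C=6, channels-packed -> ceil(6/4) = 2 z-planes per batch.
# Slicing batches [1:4:2] keeps input batches 1 and 3 (input planes 2,3 and 6,7).
for out_z in range(2 * 2):  # two output batches, two planes each
    print(out_z, "->", batch_slice_in_z(out_z, offset=1, step=2,
                                        image_in_channel_size=2))
# 0 -> 2, 1 -> 3, 2 -> 6, 3 -> 7
```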
4 changes: 2 additions & 2 deletions backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl
@@ -49,10 +49,10 @@ void main() {
for (int i=0;i<4;i++) {
ivec4 user_coor = nchwi_to_tidx(buf_indices[i], out_sizes);

int in_channel = user_coor.z;
int in_dim = user_coor[packed_dim];

ivec4 in_user_coor = user_coor;
in_user_coor.z = slice_arg.offset + in_channel * slice_arg.step;
in_user_coor[packed_dim] = slice_arg.offset + in_dim * slice_arg.step;

ivec4 in_pow_elem = to_texture_elem_pos(
in_user_coor,
39 changes: 25 additions & 14 deletions backends/vulkan/runtime/graph/ops/impl/Slice.cpp
@@ -44,8 +44,7 @@ void add_slice_tensor_copy_node(
vTensorPtr t_in = graph.get_tensor(in);
vTensorPtr t_out = graph.get_tensor(out);

VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim));
VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));

// Need to normalize the dim
int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
@@ -76,7 +75,13 @@ void add_slice_tensor_copy_node(
start = normalize_idx(start, in_sizes[dim], 0);
end = normalize_idx(end, in_sizes[dim], in_sizes[dim]);

if (dim_index == kChannel4D) {
const vkapi::SpecVarList spec_vars = {t_in->packed_dim()};

const auto packed_dim_idx =
static_cast<DimIndex>(DimIndex::DIM_LAST - t_in->packed_dim());

// if slice dim is the same as the packed dim, we can use the channel slice
if (dim_index == packed_dim_idx) {
// slice by channel
std::string kernel_name = "slice_channel";
kernel_name.reserve(kShaderNameReserve);
@@ -99,26 +104,31 @@
{in, vkapi::MemoryAccessType::READ}},
{t_out->sizes_ubo(),
t_in->sizes_ubo(),
graph.create_params_buffer(params)}));
graph.create_params_buffer(params)},
spec_vars));

} else {
// GPU's coordinate is in x, y, z
int64_t gpu_dim = -1;
int64_t stride = 1;
int64_t in_channel_stride = 1;
if (dim_index == kWidth4D) {
gpu_dim = 0; // width: x dimension in gpu
VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
} else if (dim_index == kHeight4D) {
gpu_dim = 1; // height: y dimension
VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
} else if (dim_index == kBatch4D) {
gpu_dim = 2; // batch: z dimension

// Due to channel packing, each batch value is span over stride planes
int64_t n_channels = dim_at(in_sizes, kChannel4D);
stride = utils::div_up_4(n_channels);
} else if (dim_index == kChannel4D) {
gpu_dim = 2; // channel: z dimension
VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
in_channel_stride = dim_at(in_sizes, kChannel4D);
} else {
VK_THROW("Unexpected ncwh_dim!");
gpu_dim = 3; // batch: w dimension

in_channel_stride = dim_at(in_sizes, kChannel4D);
if (packed_dim_idx == kChannel4D) {
// Due to channel packing, each batch value spans multiple planes
in_channel_stride = utils::div_up_4(in_channel_stride);
}
}

std::string kernel_name = "slice_batch_height_width";
Expand All @@ -137,7 +147,7 @@ void add_slice_tensor_copy_node(
static_cast<int32_t>(gpu_dim),
static_cast<int32_t>(start),
static_cast<int32_t>(step),
static_cast<int32_t>(stride),
static_cast<int32_t>(in_channel_stride),
};

graph.execute_nodes().emplace_back(new DispatchNode(
Expand All @@ -147,7 +157,8 @@ void add_slice_tensor_copy_node(
local_size,
{{out, vkapi::MemoryAccessType::WRITE},
{in, vkapi::MemoryAccessType::READ}},
{t_out->sizes_ubo(), graph.create_params_buffer(params)}));
{t_out->sizes_ubo(), graph.create_params_buffer(params)},
spec_vars));
}
}

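The dispatch change in Slice.cpp above boils down to: if the slice dim equals the input's packed dim, elements move within texels, so the texel-repacking `slice_channel` shader is used; otherwise whole texels are copied and `slice_batch_height_width` only shifts texture coordinates. A minimal sketch of that decision (not part of the PR; the `DimIndex` values and the WHCN `packed_dim` encoding of 0 = width, 1 = height, 2 = channels are assumptions about the backend's conventions):

```python
# DimIndex-style constants (negative, counting back from the innermost dim).
DIM_LAST, K_WIDTH, K_HEIGHT, K_CHANNEL, K_BATCH = -1, -1, -2, -3, -4

def pick_kernel(slice_dim_index, packed_dim):
    # WHCN packed_dim: 0 = width, 1 = height, 2 = channels.
    packed_dim_idx = DIM_LAST - packed_dim  # 0 -> -1 (W), 1 -> -2 (H), 2 -> -3 (C)
    if slice_dim_index == packed_dim_idx:
        # Slicing along the packed dim moves elements within texels,
        # so the texel-repacking shader is needed.
        return "slice_channel"
    # Otherwise whole texels are copied; only texture coordinates shift.
    return "slice_batch_height_width"

print(pick_kernel(K_CHANNEL, 2))  # channels-packed, slice channels -> slice_channel
print(pick_kernel(K_BATCH, 2))    # channels-packed, slice batch   -> slice_batch_height_width
print(pick_kernel(K_WIDTH, 0))    # width-packed,    slice width   -> slice_channel
```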
6 changes: 5 additions & 1 deletion backends/vulkan/test/op_tests/cases.py
@@ -585,7 +585,11 @@ def get_slice_out_inputs():
test_suite = VkTestSuite([tuple(tc) for tc in test_cases])

test_suite.dtypes = ["at::kFloat", "at::kHalf"]
test_suite.layouts = ["utils::kChannelsPacked"]
test_suite.layouts = [
"utils::kWidthPacked",
"utils::kHeightPacked",
"utils::kChannelsPacked",
]
test_suite.data_gen = "make_seq_tensor"
return test_suite

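With the layout list expanded above, every slice test case now runs under all three packings. For reference, a minimal sketch (not part of the PR) of the eager-mode semantics the cases are checked against, assuming `slice_copy` matches Python slicing along the given dim:

```python
import torch

x = torch.arange(2 * 6 * 4 * 4, dtype=torch.float).reshape(2, 6, 4, 4)
y = torch.ops.aten.slice_copy.Tensor(x, dim=1, start=1, end=6, step=2)
assert torch.equal(y, x[:, 1:6:2])  # keeps channels 1, 3, 5
```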