pytorch
diff --git a/‎backends/vulkan/test/glsl/all_shaders.yaml
+42 b/‎backends/vulkan/test/glsl/all_shaders.yaml
+42
diff --git a/‎backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl
+35 b/‎backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl
+35
diff --git a/‎backends/vulkan/test/glsl/common.h b/‎backends/vulkan/test/glsl/common.h
diff --git a/‎backends/vulkan/test/glsl/fill_texture__test.glsl
+28 b/‎backends/vulkan/test/glsl/fill_texture__test.glsl
+28
diff --git a/‎backends/vulkan/test/glsl/image_to_nchw__test.glsl
+54 b/‎backends/vulkan/test/glsl/image_to_nchw__test.glsl
+54
diff --git a/‎backends/vulkan/test/glsl/indexing_utils.h
+14 b/‎backends/vulkan/test/glsl/indexing_utils.h
+14
diff --git a/‎backends/vulkan/test/glsl/nchw_to_image__test.glsl
+56 b/‎backends/vulkan/test/glsl/nchw_to_image__test.glsl
+56
@@ -0,0 +1,42 @@
+binary_op_nobroadcast__test:  # Settings for the binary_op_nobroadcast__test shader template (presumably matched by file name -- confirm against the shader codegen).
+  parameter_names_with_default_values:
+    OPERATOR: X + Y  # Default text for the OPERATOR placeholder, which the template uses as the body of its OP(X, Y) macro.
+  shader_variants:  # One entry per variant; each sets NAME and an OPERATOR expression written in terms of X and Y.
+    - NAME: binary_add_nobroadcast__test
+      OPERATOR: X + Y  # Same expression as the default above.
+    - NAME: binary_sub_nobroadcast__test
+      OPERATOR: X - Y
+    - NAME: binary_mul_nobroadcast__test
+      OPERATOR: X * Y
+    - NAME: binary_div_nobroadcast__test
+      OPERATOR: X / Y
+    - NAME: binary_pow_nobroadcast__test
+      OPERATOR: pow(X, Y)
+
+image_to_nchw__test:  # Settings for the image_to_nchw__test shader template (presumably matched by file name -- confirm against the shader codegen).
+  parameter_names_with_default_values:
+    NDIM: 3  # Used by the template as an index into its SAMPLER_T lookup.
+    DTYPE: float  # Default element type; the template indexes its T, VEC4_T and SAMPLER_T lookups with it.
+    PACKING: CHANNELS_PACKED  # Appended to POS_TO_COORD_ in the template to pick the coordinate macro.
+  generate_variant_forall:  # Presumably emits one variant per listed DTYPE value, tagged with SUFFIX -- confirm against the shader codegen.
+    DTYPE:
+      - VALUE: "half"
+        SUFFIX: "half"
+      - VALUE: "float"
+        SUFFIX: "float"
+  shader_variants:
+    - NAME: image3d_to_nchw__test_C_packed
+
+nchw_to_image__test:  # Settings for the nchw_to_image__test shader template (presumably matched by file name -- confirm against the shader codegen).
+  parameter_names_with_default_values:
+    NDIM: 3  # Used by the template as an index into its IMAGE_T and GET_POS lookups.
+    DTYPE: float  # Default element type; the template indexes its IMAGE_FORMAT, IMAGE_T, T and VEC4_T lookups with it.
+    PACKING: CHANNELS_PACKED  # Appended to POS_TO_COORD_ in the template to pick the coordinate macro.
+  generate_variant_forall:  # Presumably emits one variant per listed DTYPE value, tagged with SUFFIX -- confirm against the shader codegen.
+    DTYPE:
+      - VALUE: "half"
+        SUFFIX: "half"
+      - VALUE: "float"
+        SUFFIX: "float"
+  shader_variants:
+    - NAME: nchw_to_image3d__test_C_packed
@@ -0,0 +1,35 @@
+#version 450 core
+// clang-format off
+#define PRECISION ${PRECISION}
+#define FORMAT ${FORMAT}
+// OP(X, Y) expands to the OPERATOR template parameter; main() applies it to the two input texels.
+#define OP(X, Y) ${OPERATOR}
+// clang-format on
+
+layout(std430) buffer;
+// Binding 0 is the write-only result image; bindings 1 and 2 are the two operands read by main().
+// clang-format off
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D image_out;
+// clang-format on
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+layout(set = 0, binding = 2) uniform PRECISION sampler3D image_other;
+// Bounds for main(): invocations at or beyond data.xyz return without writing; data.w is not read.
+layout(set = 0, binding = 3) uniform PRECISION restrict OutExtents {
+  uvec4 data;
+}
+out_extents;
+// The workgroup size is supplied through specialization constants 0, 1 and 2.
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Texel-wise binary op: image_out[p] = OP(image_in[p], image_other[p]).
+void main() {
+  const ivec3 texel_pos = ivec3(gl_GlobalInvocationID);
+  // Invocations at or beyond the output extents have nothing to write.
+  if (any(greaterThanEqual(texel_pos, out_extents.data.xyz))) {
+    return;
+  }
+
+  const vec4 lhs = texelFetch(image_in, texel_pos, 0);
+  const vec4 rhs = texelFetch(image_other, texel_pos, 0);
+  imageStore(image_out, texel_pos, OP(lhs, rhs));
+}
@@ -0,0 +1,28 @@
+#version 450 core
+#define PRECISION ${PRECISION}
+#define FORMAT ${FORMAT}
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+// uOutput is the image written by main(); params supplies the region to write and the value to store.
+// clang-format off
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+// clang-format on
+layout(set = 0, binding = 1) uniform PRECISION restrict Block {
+  ivec3 size;  // Exclusive upper bound on the texel positions main() writes.
+  int fill;  // Not read by this shader.
+  vec4 vals;  // Value stored into every in-bounds texel.
+} params;
+// The workgroup size is supplied through specialization constants 0, 1 and 2.
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Stores params.vals into every texel whose position lies below params.size.
+void main() {
+  const ivec3 texel_pos = ivec3(gl_GlobalInvocationID);
+
+  // In bounds means every component of the position is below its limit.
+  if (all(lessThan(texel_pos, params.size))) {
+    imageStore(uOutput, texel_pos, params.vals);
+  }
+}
@@ -0,0 +1,54 @@
+#version 450 core
+// clang-format off
+#define PRECISION ${PRECISION}
+// clang-format on
+// indexing_utils.h provides the POS_TO_COORD_* and COORD_TO_BUFFER_IDX macros used in main().
+#include "indexing_utils.h"
+
+layout(std430) buffer;
+// image_in is the texture read by main(); buffer_out receives one element per texel component.
+layout(set = 0, binding = 0) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_in;
+layout(set = 0, binding = 1) buffer PRECISION restrict writeonly Buffer {
+  ${T[DTYPE]} data[];
+}
+buffer_out;
+// Extents used by main() for the bounds check and for the x*y stride between texel components.
+layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes {
+  ivec4 data;
+}
+gpu_sizes;
+// Extents used by main() to compute buffer indices; data.z limits which components are written.
+layout(set = 0, binding = 3) uniform PRECISION restrict CpuSizes {
+  ivec4 data;
+}
+cpu_sizes;
+// The workgroup size is supplied through specialization constants 0, 1 and 2.
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Copies the texel at this invocation position into buffer_out, writing one
+// buffer element per texel component.
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  const ivec4 coord = POS_TO_COORD_${PACKING}(pos, gpu_sizes.data);
+
+  // Invocations whose coordinate falls outside gpu_sizes have nothing to copy.
+  if (any(greaterThanEqual(coord, gpu_sizes.data))) {
+    return;
+  }
+
+  const ${VEC4_T[DTYPE]} texel = texelFetch(image_in, pos, 0);
+
+  // The four components land one x*y plane apart, starting at the index of coord.
+  const int first_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data);
+  const int plane_stride = gpu_sizes.data.x * gpu_sizes.data.y;
+  const int z_limit = cpu_sizes.data.z;
+
+  // Component c belongs to z index coord.z + c; components at or beyond
+  // cpu_sizes.data.z are not written.
+  for (int c = 0; c < 4; ++c) {
+    if (coord.z + c >= z_limit) {
+      continue;
+    }
+    buffer_out.data[first_index + c * plane_stride] = texel[c];
+  }
+}
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Texel position -> ivec4 coordinate of its first component: z advances 4 per texel and wraps at sizes.z into w.
+#define POS_TO_COORD_CHANNELS_PACKED(pos, sizes) \
+  ivec4((pos).x, (pos).y, ((pos).z * 4) % (sizes).z, ((pos).z * 4) / (sizes).z)
+// Linear index of coord for extents sizes (x fastest, then y, z, w). Fully parenthesized expression with no trailing semicolon.
+#define COORD_TO_BUFFER_IDX(coord, sizes) \
+  ((coord).x + (sizes).x * ((coord).y + (sizes).y * ((coord).z + (sizes).z * (coord).w)))
@@ -0,0 +1,56 @@
+#version 450 core
+// clang-format off
+#define PRECISION ${PRECISION}
+// clang-format on
+// indexing_utils.h provides the POS_TO_COORD_* and COORD_TO_BUFFER_IDX macros used in main().
+#include "indexing_utils.h"
+
+layout(std430) buffer;
+// image_out is the write-only destination image; buffer_in is the read-only source of scalar elements.
+// clang-format off
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+// clang-format on
+layout(set = 0, binding = 1) buffer  PRECISION restrict readonly Buffer {
+  ${T[DTYPE]} data[];
+}
+buffer_in;
+// Extents used by main() for the bounds check and for the x*y stride between texel components.
+layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes {
+  ivec4 data;
+}
+gpu_sizes;
+// Extents used by main() to compute buffer indices; components at or beyond data.z are treated as padding.
+layout(set = 0, binding = 3) uniform PRECISION restrict CpuSizes {
+  ivec4 data;
+}
+cpu_sizes;
+// The workgroup size is supplied through specialization constants 0, 1 and 2.
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Packs four consecutive z elements of buffer_in into one texel of image_out; padding components are stored as zero.
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  const ivec4 coord = POS_TO_COORD_${PACKING}(pos, gpu_sizes.data);
+
+  // Invocations whose coordinate falls outside gpu_sizes have nothing to write.
+  if (any(greaterThanEqual(coord, gpu_sizes.data))) {
+    return;
+  }
+
+  // Index of the first component in the buffer; the following components are
+  // one x*y plane apart.
+  const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data);
+  const ivec4 buf_indices =
+      base_index + ivec4(0, 1, 2, 3) * (gpu_sizes.data.x * gpu_sizes.data.y);
+
+  // Components at or beyond cpu_sizes.data.z are padding. Load only the real
+  // ones: an unguarded load can index outside the extents given by cpu_sizes,
+  // and multiplying the result by a 0/1 mask would turn Inf or NaN into NaN.
+  const bvec4 in_range = lessThan(ivec4(coord.z) + ivec4(0, 1, 2, 3), ivec4(cpu_sizes.data.z));
+  ${T[DTYPE]} val_x = in_range.x ? buffer_in.data[buf_indices.x] : ${T[DTYPE]}(0);
+  ${T[DTYPE]} val_y = in_range.y ? buffer_in.data[buf_indices.y] : ${T[DTYPE]}(0);
+  ${T[DTYPE]} val_z = in_range.z ? buffer_in.data[buf_indices.z] : ${T[DTYPE]}(0);
+  ${T[DTYPE]} val_w = in_range.w ? buffer_in.data[buf_indices.w] : ${T[DTYPE]}(0);
+  ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w);
+  imageStore(image_out, ${GET_POS[NDIM]("pos")}, texel);
+}