diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl
index 25113887dca..327c3868847 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl
@@ -27,6 +27,8 @@ ${layout_declare_ubo(B, "ivec4", "sizes")}
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 ${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "transpose_hw", "0")}
+
 const lowp ivec4 axis_map = unhash_axis_map(t_layout);
 const lowp int packed_dim = unhash_packed_dim(t_layout);
 
@@ -41,8 +43,23 @@ int extend_sign(int x) {
 }
 
 ivec4 read_texel(ivec4 tidx) {
+  ivec4 tidx_to_use = tidx;
+  ivec4 sizes_to_use = sizes;
+  int packed_dim_to_use = packed_dim;
+  if (transpose_hw == 1) {
+    sizes_to_use.xy = sizes_to_use.yx;
+    tidx_to_use.xy = tidx.yx;
+
+    if (packed_dim == 1) {
+      packed_dim_to_use = 0;
+    }
+    if (packed_dim == 0) {
+      packed_dim_to_use = 1;
+    }
+  }
+
   const ivec4 buf_indices = tidx_to_nchwi(
-      tidx, sizes, packed_dim);
+      tidx_to_use, sizes_to_use, packed_dim_to_use);
 
   int shift = (1 << 8) - 1;
   ivec4 masks;
@@ -70,7 +87,7 @@ ivec4 read_texel(ivec4 tidx) {
 
 void main() {
   const ivec3 lpos = ivec3(gl_GlobalInvocationID);
-  const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim);
+  ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim);
 
   if (any(greaterThanEqual(tidx, sizes))) {
     return;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
index bf498f34d5b..32235a9ad65 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
@@ -21,6 +21,7 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 // This constant is unused in this shader but is kept so that the signature is
 // consistent with nchw_to_image.
 ${layout_declare_spec_const(C, "int", "UNUSED_layout", "0")}
+${layout_declare_spec_const(C, "int", "transpose_hw", "0")}
 
 void main() {
   int out_bufi = int(gl_GlobalInvocationID.x);
@@ -29,7 +30,13 @@ void main() {
   }
 
   ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides);
-  const int in_nchwi = tidx_to_nchwi(out_tidx, out_sizes);
+
+  ivec4 sizes = out_sizes;
+  if (transpose_hw == 1) {
+    sizes.xy = sizes.yx;
+    out_tidx.xy = out_tidx.yx;
+  }
+  const int in_nchwi = tidx_to_nchwi(out_tidx, sizes);
 
   t_out[out_bufi] = nchw_in[in_nchwi];
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
index 3d2a102dac7..2f55535c82c 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
@@ -30,14 +30,31 @@ $if not FROM_STAGING:
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 ${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "transpose_hw", "0")}
+
 const lowp ivec4 axis_map = unhash_axis_map(t_layout);
 const lowp int packed_dim = unhash_packed_dim(t_layout);
 
 VEC4_T read_texel(ivec4 tidx) {
+  ivec4 tidx_to_use = tidx;
+  ivec4 sizes_to_use = sizes;
+  int packed_dim_to_use = packed_dim;
+  if (transpose_hw == 1) {
+    sizes_to_use.xy = sizes_to_use.yx;
+    tidx_to_use.xy = tidx.yx;
+
+    if (packed_dim == 1) {
+      packed_dim_to_use = 0;
+    }
+    if (packed_dim == 0) {
+      packed_dim_to_use = 1;
+    }
+  }
+
   $if FROM_STAGING:
-    const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim);
+    const ivec4 buf_indices = tidx_to_nchwi(tidx_to_use, sizes_to_use, packed_dim_to_use);
   $else:
-    const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim);
+    const ivec4 buf_indices = tidx_to_4bufi(tidx_to_use, buf_strides, packed_dim_to_use);
 
   VEC4_T texel = VEC4_T(0);
   if (tidx[packed_dim] < sizes[packed_dim]) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
index 56bffaee675..228e2e8f870 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
@@ -64,24 +64,21 @@ void main() {
 
   FLOAT_T outval = FLOAT_T(0.0);
 
-  // Initial mat1 tensor idx will be (0, out_tidx.y, out_tidx.z, 0)
   int mat1_offset = out_tidx.y * mat1_strides.y + out_tidx.z * qmat2_strides.z;
-  // Initial qmat2 tensor idx wil be (0, out_tidx.x, 0, 0); note that the qmat2
-  // tensor is transposed
-  int qmat2_offset = out_tidx.x * qmat2_strides.y;
+  int qmat2_offset = out_tidx.x;
 
   // TODO(ssjia): optimize memory access pattern by traversing mat1 x in inner loop
   for (int i = 0; i < mat1_sizes.x; i++) {
     const FLOAT_T mat1_val = t_mat1[mat1_offset];
-    const FLOAT_T mat2_val = t_qmat2[qmat2_offset] * scale;
+    const FLOAT_T mat2_val = FLOAT_T(t_qmat2[qmat2_offset]);
 
     outval += mat1_val * mat2_val;
 
     mat1_offset++;
-    qmat2_offset++;
+    qmat2_offset += qmat2_strides.y;
   }
 
-  t_out[out_bufi] = outval;
+  t_out[out_bufi] = outval * scale;
 }
 
 #else // USING_TEXTURE
@@ -97,25 +94,27 @@ void main() {
 
     return;
   }
-  const uint16_t qmat2_pos_y = out_pos.x * uint16_t(4);
+  const uint16_t qmat2_pos_x = out_pos.x;
 
   VEC4_T outtex = VEC4_T(0);
 
   const VEC4_T scales = load_texel(t_scales, u16vec3(out_pos.x, 0, 0));
 
+  VEC4_T mat1_tex;
+  VEC4_T mat2_tex[4];
   for (
       uint16_t i = uint16_t(0), x = uint16_t(0);
       i < uint16_t(mat1_sizes.x);
       i += uint16_t(4), x++)
   {
-    const VEC4_T mat1_tex = load_texel(t_mat1, u16vec3(x, out_pos.y, 0));
-    const VEC4_T sums = VEC4_T(
-        dot(mat1_tex, load_texel(t_qmat2, u16vec3(x, qmat2_pos_y, 0))),
-        dot(mat1_tex, load_texel(t_qmat2, u16vec3(x, qmat2_pos_y + uint16_t(1), 0))),
-        dot(mat1_tex, load_texel(t_qmat2, u16vec3(x, qmat2_pos_y + uint16_t(2), 0))),
-        dot(mat1_tex, load_texel(t_qmat2, u16vec3(x, qmat2_pos_y + uint16_t(3), 0))));
-
-    outtex += sums;
+    mat1_tex = load_texel(t_mat1, u16vec3(x, out_pos.y, 0));
+
+    mat2_tex[0] = load_texel(t_qmat2, u16vec3(out_pos.x, i, 0));
+    mat2_tex[1] = load_texel(t_qmat2, u16vec3(out_pos.x, i + uint16_t(1), 0));
+    mat2_tex[2] = load_texel(t_qmat2, u16vec3(out_pos.x, i + uint16_t(2), 0));
+    mat2_tex[3] = load_texel(t_qmat2, u16vec3(out_pos.x, i + uint16_t(3), 0));
+
+    outtex += mat1_tex.x * mat2_tex[0] + mat1_tex.y * mat2_tex[1] + mat1_tex.z * mat2_tex[2] + mat1_tex.w * mat2_tex[3];
   }
 
   outtex *= scales;
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
index 59684d73bd2..2011331ec38 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
@@ -48,7 +48,7 @@ void resize_q_8w_linear_node(
   vTensorPtr qmat2 = graph->get_tensor(args[1].refs[1]);
 
   const int out_cols = utils::val_at(-2, mat1->sizes());
-  const int out_rows = utils::val_at(-2, qmat2->sizes());
+  const int out_rows = utils::val_at(-1, qmat2->sizes());
 
   std::vector<int64_t> new_out_sizes(3);
   if (mat1->sizes().size() == 2) {
@@ -86,7 +86,7 @@ void add_q_8w_linear_node(
     // Ensure out is packed correctly
     out_W_packed = out_tmp;
   }
-  ValueRef q_mat2 = prepack_standard(
+  ValueRef q_mat2 = prepack_standard_hw_transposed(
       graph, q_mat2_data, graph.storage_type_of(out), utils::kWidthPacked);
   ValueRef scales = prepack_standard(
       graph, scales_data, graph.storage_type_of(out), utils::kWidthPacked);
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
index 959d3974b73..f59d1cd65d9 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -113,7 +113,8 @@ void add_tensor_to_staging_node(
 void add_prepack_standard_node(
     ComputeGraph& graph,
     const ValueRef tensor_data,
-    const ValueRef tensor) {
+    const ValueRef tensor,
+    const bool transpose_hw = false) {
   vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
       *graph.get_tensor(tensor), graph.int8_buffers_enabled());
 
@@ -127,6 +128,8 @@ void add_prepack_standard_node(
     ubos.append({graph.sizes_ubo(tensor)});
   }
 
+  int transpose_hw_spec = transpose_hw ? 1 : 0;
+
   graph.prepack_nodes().emplace_back(new PrepackNode(
       graph,
       shader,
@@ -138,7 +141,7 @@ void add_prepack_standard_node(
       // Parameter Buffers
      ubos,
       // Specialization Constants
-      {graph.hashed_layout_of(tensor)}));
+      {graph.hashed_layout_of(tensor), transpose_hw_spec}));
 }
 
 ValueRef prepack_standard(
@@ -158,6 +161,33 @@ ValueRef prepack_standard(
   return tensor;
 }
 
+ValueRef prepack_standard_hw_transposed(
+    ComputeGraph& graph,
+    const ValueRef tensor_data,
+    const utils::StorageType storage_type,
+    const utils::GPUMemoryLayout layout,
+    const bool passthrough,
+    const utils::AxisMapLayout axis_map_layout) {
+  (void)passthrough;
+
+  VK_CHECK_COND(graph.val_is_tref(tensor_data));
+  std::vector<int64_t> new_out_sizes = graph.sizes_of(tensor_data);
+  const int w_dim = new_out_sizes.size() - 1;
+  const int h_dim = new_out_sizes.size() - 2;
+  const int64_t tmp = new_out_sizes.at(w_dim);
+  new_out_sizes.at(w_dim) = new_out_sizes.at(h_dim);
+  new_out_sizes.at(h_dim) = tmp;
+  ValueRef tensor = graph.add_tensor(
+      new_out_sizes,
+      graph.dtype_of(tensor_data),
+      storage_type,
+      layout,
+      -1,
+      axis_map_layout);
+  add_prepack_standard_node(graph, tensor_data, tensor, true);
+  return tensor;
+}
+
 ValueRef prepack_standard_like(
     ComputeGraph& graph,
     const ValueRef tensor_data,
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h
index bc501d5d053..1b6f245bd34 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.h
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h
@@ -51,6 +51,18 @@ ValueRef prepack_standard(
     const bool passthrough = false,
     const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap);
 
+/*
+ * Same as prepack_standard, but transpose the height and width dimensions of
+ * the tensor while packing.
+ */
+ValueRef prepack_standard_hw_transposed(
+    ComputeGraph& graph,
+    const ValueRef tensor_data,
+    const utils::StorageType storage_type,
+    const utils::GPUMemoryLayout layout,
+    const bool passthrough = false,
+    const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap);
+
 /*
  * Equivalent to `prepack_standard()` function, except the `storage_type` and
  * `memory_layout` are set to match `to_copy`, which must be a `Tensor`.
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 41d8edf1f25..329d62c2285 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -157,12 +157,14 @@ def get_weight_int8pack_mm_inputs():
         [6, 1024, 256],
         [6, 256, 256],
         [6, 256, 512],
+        [4, 768, 4096],
+        [1024, 1024, 1024],
     ]
 
     inputs_list = [((M, K), (N, K), (N)) for M, K, N in MKN_list]
 
     test_suite = VkTestSuite(inputs_list)
-    test_suite.dtypes = ["at::kFloat", "at::kHalf"]
+    test_suite.dtypes = ["at::kFloat"]
     test_suite.layouts = ["utils::kWidthPacked"]
     test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
     test_suite.prepacked_args = ["mat2", "scales"]
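
Note (not part of the patch): a minimal sketch of how an op implementation could call the prepack_standard_hw_transposed helper introduced in Staging.h/Staging.cpp above, mirroring the add_q_8w_linear_node change in QuantizedLinear.cpp. Only the helper's signature, graph.storage_type_of, and utils::kWidthPacked come from this diff; the wrapper name prepack_transposed_weight and the include prefix are illustrative assumptions.

// Assumed include path for the header modified above.
#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>

namespace vkcompute {

// Illustrative wrapper: prepacks a weight tensor ref with its H and W dims
// swapped. The tensor is stored transposed on the GPU, and the packing shader
// is dispatched with the transpose_hw specialization constant set to 1 so it
// still reads the staging buffer with the original (H, W) ordering.
ValueRef prepack_transposed_weight(
    ComputeGraph& graph,
    const ValueRef weight_data, // must be a tensor ref (tref)
    const ValueRef out) {       // output tensor; its storage type is reused
  return prepack_standard_hw_transposed(
      graph,
      weight_data,
      graph.storage_type_of(out), // match the output tensor's storage type
      utils::kWidthPacked);       // width-packed, as in add_q_8w_linear_node
}

} // namespace vkcompute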