tu: Handle non-identity GMEM swaps when resolving

cwabbott0 · Marge Bot · commit a104a7ca1acf · 2025-01-23T05:53:40.000Z
There is a single swap field for each color attachment, regardless of whether it's in GMEM or not, and this does appear to be used in GMEM mode when MUTABLEEN is set on the attachment. This means that when a color attachment has a non-identity swap because it's mutable on a750, we have to use the same corresponding swap when it's a source in a GMEM resolve. When using the fastpath, we have to make sure that the swaps match because there aren't separate fields for GMEM and sysmem swap. This fixes dEQP-VK.image.mutable.2d.*_b8g8r8a8_unorm_draw_copy_resolve with TU_DEBUG=gmem. Fixes: 247d11d ("tu: Allow UBWC with images with swapped formats.") Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33115>
diff --git a/src/freedreno/fdl/fd6_view.c b/src/freedreno/fdl/fd6_view.c
@@ -402,6 +402,8 @@ fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts,
       tile_mode == TILE6_LINEAR && args->base_miplevel != layout->mip_levels - 1;
 
    view->ubwc_enabled = ubwc_enabled;
+   view->is_mutable = layout->is_mutable;
+   view->color_swap = color_swap;
 
    view->RB_MRT_BUF_INFO =
       A6XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) |
diff --git a/src/freedreno/fdl/freedreno_layout.h b/src/freedreno/fdl/freedreno_layout.h
@@ -297,6 +297,8 @@ struct fdl6_view {
    bool need_y2_align;
 
    bool ubwc_enabled;
+   bool is_mutable;
+   uint8_t color_swap;
 
    enum pipe_format format;
 
diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc
@@ -1334,7 +1334,9 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd,
    uint32_t desc[A6XX_TEX_CONST_DWORDS];
    memcpy(desc, iview->view.descriptor, sizeof(desc));
 
-   enum a6xx_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false, true).fmt;
+   enum a6xx_format fmt =
+      blit_format_texture<CHIP>(format, TILE6_2,
+                                iview->view.is_mutable, true).fmt;
    fixup_src_format(&format, dst_format, &fmt);
 
    /* patch the format so that depth/stencil get the right format and swizzle */
@@ -1348,7 +1350,9 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd,
                A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
 
    /* patched for gmem */
-   desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
+   desc[0] &= ~A6XX_TEX_CONST_0_TILE_MODE__MASK;
+   if (!iview->view.is_mutable)
+      desc[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
    desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
    desc[2] =
       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
@@ -4856,7 +4860,8 @@ template <chip CHIP>
 static void
 store_cp_blit(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
-              const struct tu_image_view *iview,
+              const struct tu_image_view *src_iview,
+              const struct tu_image_view *dst_iview,
               uint32_t samples,
               bool separate_stencil,
               enum pipe_format src_format,
@@ -4867,33 +4872,44 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
 {
    r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format,
                           VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
-                          iview->view.ubwc_enabled, true);
+                          dst_iview->view.ubwc_enabled, true);
 
-   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+   if (dst_iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
       if (!separate_stencil) {
-         r2d_dst_depth(cs, iview, layer);
+         r2d_dst_depth(cs, dst_iview, layer);
       } else {
-         r2d_dst_stencil(cs, iview, layer);
+         r2d_dst_stencil(cs, dst_iview, layer);
       }
    } else {
-      r2d_dst<CHIP>(cs, &iview->view, layer, src_format);
+      r2d_dst<CHIP>(cs, &dst_iview->view, layer, src_format);
    }
 
-   enum a6xx_format fmt = blit_format_texture<CHIP>(src_format, TILE6_2, false, true).fmt;
-   fixup_src_format(&src_format, dst_format, &fmt);
+   /* Note: we compute the swap here instead of using the color_swap as
+    * programmed when we setup the color attachment because the attachment in
+    * GMEM ignores the swap except when MUTABLEEN is enabled. If the
+    * color attachment is linear, we need to use the identity swap even if the
+    * original attachment has a non-identity swap.
+    */
+   struct tu_native_format fmt =
+      blit_format_texture<CHIP>(src_format, TILE6_2,
+                                src_iview->view.is_mutable, true);
+   enum a6xx_format format = fmt.fmt;
+   fixup_src_format(&src_format, dst_format, &format);
 
    tu_cs_emit_regs(cs,
                    SP_PS_2D_SRC_INFO(CHIP,
-                      .color_format = fmt,
+                      .color_format = format,
                       .tile_mode = TILE6_2,
-                      .color_swap = WZYX,
+                      .color_swap = fmt.swap,
                       .srgb = util_format_is_srgb(src_format),
                       .samples = tu_msaa_samples(samples),
                       .samples_average = !util_format_is_pure_integer(dst_format) &&
                                          !util_format_is_depth_or_stencil(dst_format),
                       .unk20 = 1,
                       .unk22 = 1),
-                   SP_PS_2D_SRC_SIZE(CHIP, .width = iview->vk.extent.width, .height = iview->vk.extent.height),
+                   SP_PS_2D_SRC_SIZE(CHIP,
+                      .width = dst_iview->vk.extent.width,
+                      .height = dst_iview->vk.extent.height),
                    SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
                    SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
 
@@ -4921,7 +4937,8 @@ template <chip CHIP>
 static void
 store_3d_blit(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
-              const struct tu_image_view *iview,
+              const struct tu_image_view *src_iview,
+              const struct tu_image_view *dst_iview,
               VkSampleCountFlagBits dst_samples,
               bool separate_stencil,
               enum pipe_format src_format,
@@ -4949,21 +4966,21 @@ store_3d_blit(struct tu_cmd_buffer *cmd,
    }
 
    r3d_setup<CHIP>(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT,
-                   0, false, iview->view.ubwc_enabled, dst_samples);
+                   0, false, dst_iview->view.ubwc_enabled, dst_samples);
 
    r3d_coords(cmd, cs, render_area->offset, render_area->offset, render_area->extent);
 
-   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+   if (dst_iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
       if (!separate_stencil) {
-         r3d_dst_depth<CHIP>(cs, iview, layer);
+         r3d_dst_depth<CHIP>(cs, dst_iview, layer);
       } else {
-         r3d_dst_stencil<CHIP>(cs, iview, layer);
+         r3d_dst_stencil<CHIP>(cs, dst_iview, layer);
       }
    } else {
-      r3d_dst<CHIP>(cs, &iview->view, layer, src_format);
+      r3d_dst<CHIP>(cs, &dst_iview->view, layer, src_format);
    }
 
-   r3d_src_gmem<CHIP>(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp);
+   r3d_src_gmem<CHIP>(cmd, cs, src_iview, src_format, dst_format, gmem_offset, cpp);
 
    /* sync GMEM writes with CACHE. */
    tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
@@ -5033,6 +5050,29 @@ tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
            (y2 % phys_dev->info->gmem_align_h && need_y2_align));
 }
 
+/* The fast path cannot handle the corner case where GMEM and sysmem
+ * attachments have different swap if the GMEM attachment is mutable, which
+ * can happen when a mutable color attachment is being resolved into a
+ * non-mutable resolve attachment. In such a case, if the format is a swapped
+ * format like BGRA8, the color attachment will be stored in GMEM swapped but
+ * the resolve attachment in sysmem will not be swapped and there's no way to
+ * express that in the hardware because it computes the GMEM swap from the
+ * sysmem swap.
+ */
+static bool
+tu_attachment_store_mismatched_swap(struct tu_cmd_buffer *cmd, uint32_t a,
+                                    uint32_t gmem_a)
+{
+   if (a == gmem_a)
+      return false;
+
+   const struct tu_image_view *dst_iview = cmd->state.attachments[a];
+   const struct tu_image_view *src_iview = cmd->state.attachments[gmem_a];
+
+   return src_iview->view.is_mutable &&
+      dst_iview->view.color_swap != src_iview->view.color_swap;
+}
+
 /* Choose the GMEM layout (use the CCU space or not) based on whether the
  * current attachments will need.  This has to happen at vkBeginRenderPass()
  * time because tu_attachment_store_unaligned() looks at the image views, which
@@ -5062,6 +5102,21 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
    }
 
+   for (unsigned i = 0; i < cmd->state.pass->subpass_count; i++) {
+      const struct tu_subpass *subpass = &cmd->state.pass->subpasses[i];
+      for (unsigned j = 0; j < subpass->resolve_count; j++) {
+         uint32_t a = subpass->resolve_attachments[j].attachment;
+         if (a == VK_ATTACHMENT_UNUSED)
+            continue;
+         uint32_t gmem_a =
+            j == subpass->color_count ?
+               subpass->depth_stencil_attachment.attachment :
+               subpass->color_attachments[j].attachment;
+         if (tu_attachment_store_mismatched_swap(cmd, a, gmem_a))
+            cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
+      }
+   }
+
    cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
 }
 
@@ -5117,8 +5172,9 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
 {
    const VkRect2D *render_area = &cmd->state.render_area;
    struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
-   const struct tu_image_view *iview = cmd->state.attachments[a];
+   const struct tu_image_view *dst_iview = cmd->state.attachments[a];
    struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
+   const struct tu_image_view *src_iview = cmd->state.attachments[a];
    const VkClearValue *clear_value = &cmd->state.clear_values[gmem_a];
    bool resolve = a != gmem_a;
    if (resolve)
@@ -5128,6 +5184,7 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
       return;
 
    bool unaligned = tu_attachment_store_unaligned(cmd, a);
+   bool mismatched_swap = tu_attachment_store_mismatched_swap(cmd, a, gmem_a);
 
    /* D32_SFLOAT_S8_UINT is quite special format: it has two planes,
     * one for depth and other for stencil. When resolving a MSAA
@@ -5147,7 +5204,7 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
    bool store_common = dst->store && !resolve_d32s8_s8;
    bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;
 
-   bool use_fast_path = !unaligned && !resolve_d24s8_s8 &&
+   bool use_fast_path = !unaligned && !mismatched_swap && !resolve_d24s8_s8 &&
                         (a == gmem_a || blit_can_resolve(dst->format));
 
    trace_start_gmem_store(&cmd->trace, cs, dst->format, use_fast_path, unaligned);
@@ -5163,9 +5220,9 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
    /* use fast path when render area is aligned, except for unsupported resolve cases */
    if (use_fast_path) {
       if (store_common)
-         tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, src, clear_value, BLIT_EVENT_STORE, false);
+         tu_emit_blit<CHIP>(cmd, cs, resolve_group, dst_iview, src, clear_value, BLIT_EVENT_STORE, false);
       if (store_separate_stencil)
-         tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, src, clear_value, BLIT_EVENT_STORE, true);
+         tu_emit_blit<CHIP>(cmd, cs, resolve_group, dst_iview, src, clear_value, BLIT_EVENT_STORE, true);
 
       if (cond_exec) {
          tu_end_load_store_cond_exec(cmd, cs, false);
@@ -5198,11 +5255,11 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
 
       for_each_layer(i, layer_mask, layers) {
          if (store_common) {
-            store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, false, src_format,
+            store_3d_blit<CHIP>(cmd, cs, src_iview, dst_iview, dst->samples, false, src_format,
                           dst_format, render_area, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
          }
          if (store_separate_stencil) {
-            store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
+            store_3d_blit<CHIP>(cmd, cs, src_iview, dst_iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
                           PIPE_FORMAT_S8_UINT, render_area, i,
                           tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
          }
@@ -5236,11 +5293,11 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                                          state);
          }
          if (store_common) {
-            store_cp_blit<CHIP>(cmd, cs, iview, src->samples, false, src_format,
+            store_cp_blit<CHIP>(cmd, cs, src_iview, dst_iview, src->samples, false, src_format,
                           dst_format, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
          }
          if (store_separate_stencil) {
-            store_cp_blit<CHIP>(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
+            store_cp_blit<CHIP>(cmd, cs, src_iview, dst_iview, src->samples, true, PIPE_FORMAT_S8_UINT,
                           PIPE_FORMAT_S8_UINT, i, tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
          }
       }