[AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions #100485

jwanggit86 · 2024-07-24T23:58:29Z

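For v_dot4 and v_dot8 instructions that operate on packed 8-bit or 4-bit data (e.g., v_dot4_u32_u8, v_dot8_u32_u4), the op_sel modifier should not be accepted, so the assembler now rejects it. As the added error tests show, op_sel_hi is rejected on these instructions as well.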

llvmbot · 2024-07-24T23:58:49Z

@llvm/pr-subscribers-mc

Author: Jun Wang (jwanggit86)

Changes

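For v_dot4 and v_dot8 instructions that operate on packed 8-bit or 4-bit data (e.g., v_dot4_u32_u8, v_dot8_u32_u4), the op_sel modifier should not be accepted, so the assembler now rejects it. As the added error tests show, op_sel_hi is rejected on these instructions as well.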

Patch is 100.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100485.diff

11 Files Affected:

(modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+5-5)
(modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+6-5)
(modified) llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir (+3-3)
(modified) llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir (+3-3)
(modified) llvm/test/MC/AMDGPU/dl-insts.s (-192)
(modified) llvm/test/MC/AMDGPU/gfx1030_err.s (+362)
(added) llvm/test/MC/AMDGPU/gfx11_asm_vop3p_err.s (+219)
(modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s (+218)
(added) llvm/test/MC/AMDGPU/gfx908_err.s (+436)
(modified) llvm/test/MC/AMDGPU/gfx90a_err.s (+435)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx908-dl-insts.txt (-276)

diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index f4d2c29158f49..942f1dc251766 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -384,19 +384,19 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
 let OtherPredicates = [HasDot7Insts] in {
 let IsInvalidSingleUseConsumer = 1 in {
   defm V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8",
-    VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+    VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_udot4, 1>;
 }
 defm V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4",
-  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
+  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_udot8, 1>;
 } // End OtherPredicates = [HasDot7Insts]
 
 let OtherPredicates = [HasDot1Insts] in {
 let IsInvalidSingleUseConsumer = 1 in {
   defm V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8",
-    VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+    VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_sdot4, 1>;
 }
 defm V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4",
-  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
+  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_sdot8, 1>;
 } // End OtherPredicates = [HasDot1Insts]
 
 def DOT2_BF16_Profile
@@ -415,7 +415,7 @@ defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", DOT2_BF16_Profile,
 
 multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
   let IsDOT = 1 in
-  defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>,
+  defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>,
                         null_frag, 1>;
   // Dot-iu instructions consider input as signed if imod neg bits are set. Thus
   // Dot-iu Intrinsics have extra operands and require separate codegen pattern.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f2ed17ac305a1..26bf6d6ff7b89 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1253,11 +1253,12 @@ class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> {
   bit IsMAI = MAI;
 }
 
-def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>;
-def VOP3_CLAMP   : VOP3Features<1, 0, 0, 0>;
-def VOP3_OPSEL   : VOP3Features<1, 1, 0, 0>;
-def VOP3_PACKED  : VOP3Features<1, 1, 1, 0>;
-def VOP3_MAI     : VOP3Features<0, 0, 0, 1>;
+def VOP3_REGULAR         : VOP3Features<0, 0, 0, 0>;
+def VOP3_CLAMP           : VOP3Features<1, 0, 0, 0>;
+def VOP3_OPSEL           : VOP3Features<1, 1, 0, 0>;
+def VOP3_PACKED          : VOP3Features<1, 1, 1, 0>;
+def VOP3_PACKED_NO_OPSEL : VOP3Features<1, 0, 1, 0>;
+def VOP3_MAI             : VOP3Features<0, 0, 0, 1>;
 
 class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
 
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
index 8be7308c8a6e0..3feccff715bc1 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
@@ -746,7 +746,7 @@ name:            smfma4x4_write_vgpr_dot_write
 body:             |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: smfma4x4_read_srcc_vgpr_valu_write
 # GCN:      V_MFMA
@@ -945,7 +945,7 @@ name:            dot_write_vgpr_different_dot_read_srcc
 body:             |
   bb.0:
     $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
-    $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: dot_write_vgpr_different_dot_write
 # GCN:      V_DOT
@@ -955,7 +955,7 @@ name:            dot_write_vgpr_different_dot_write
 body:             |
   bb.0:
     $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
-    $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: dot_write_vgpr_different_valu_read
 # GCN:      V_DOT
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index a98b02d792d98..9fb6a7b6b5e55 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -1014,7 +1014,7 @@ name:            xdl_smfma4x4_write_vgpr_dot_write
 body:             |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: nonxdl_smfma4x4_read_srcc_vgpr_valu_write
 # GCN:      V_MFMA
@@ -1208,7 +1208,7 @@ name:            dot_write_vgpr_different_dot_read_srcc
 body:             |
   bb.0:
     $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
-    $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: dot_write_vgpr_different_dot_write
 # GCN:      V_DOT
@@ -1218,7 +1218,7 @@ name:            dot_write_vgpr_different_dot_write
 body:             |
   bb.0:
     $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
-    $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: dot_write_vgpr_different_valu_read
 # GCN:      V_DOT
diff --git a/llvm/test/MC/AMDGPU/dl-insts.s b/llvm/test/MC/AMDGPU/dl-insts.s
index 00e9bec7eb0a2..599734aac829d 100644
--- a/llvm/test/MC/AMDGPU/dl-insts.s
+++ b/llvm/test/MC/AMDGPU/dl-insts.s
@@ -536,198 +536,6 @@ v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
 v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
 // CHECK: encoding: [0x00,0x18,0xa7,0xd3,0x01,0x05,0x0e,0x1c]
 v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
 
 //
 // Test clamp.
diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s
index 51498d3c86d7f..87a09875f75e9 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_err.s
@@ -211,3 +211,365 @@ image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20], s[12
 // missing dim
 image_msaa_load v[1:4], v[5:7], s[8:15] dmask:0xf glc
 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+// op_sel not allowed in dot opcodes with 4- or 8-bit packed data
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i...
[truncated]

llvmbot · 2024-07-24T23:58:50Z

@llvm/pr-subscribers-backend-amdgpu

Author: Jun Wang (jwanggit86)

Changes

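For v_dot4 and v_dot8 instructions that operate on packed 8-bit or 4-bit data (e.g., v_dot4_u32_u8, v_dot8_u32_u4), the op_sel modifier should not be accepted, so the assembler now rejects it. As the added error tests show, op_sel_hi is rejected on these instructions as well.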

Patch is 100.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100485.diff

11 Files Affected:

(modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+5-5)
(modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+6-5)
(modified) llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir (+3-3)
(modified) llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir (+3-3)
(modified) llvm/test/MC/AMDGPU/dl-insts.s (-192)
(modified) llvm/test/MC/AMDGPU/gfx1030_err.s (+362)
(added) llvm/test/MC/AMDGPU/gfx11_asm_vop3p_err.s (+219)
(modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s (+218)
(added) llvm/test/MC/AMDGPU/gfx908_err.s (+436)
(modified) llvm/test/MC/AMDGPU/gfx90a_err.s (+435)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx908-dl-insts.txt (-276)

diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index f4d2c29158f49..942f1dc251766 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -384,19 +384,19 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
 let OtherPredicates = [HasDot7Insts] in {
 let IsInvalidSingleUseConsumer = 1 in {
   defm V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8",
-    VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+    VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_udot4, 1>;
 }
 defm V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4",
-  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
+  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_udot8, 1>;
 } // End OtherPredicates = [HasDot7Insts]
 
 let OtherPredicates = [HasDot1Insts] in {
 let IsInvalidSingleUseConsumer = 1 in {
   defm V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8",
-    VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+    VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_sdot4, 1>;
 }
 defm V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4",
-  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
+  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_sdot8, 1>;
 } // End OtherPredicates = [HasDot1Insts]
 
 def DOT2_BF16_Profile
@@ -415,7 +415,7 @@ defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", DOT2_BF16_Profile,
 
 multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
   let IsDOT = 1 in
-  defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>,
+  defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>,
                         null_frag, 1>;
   // Dot-iu instructions consider input as signed if imod neg bits are set. Thus
   // Dot-iu Intrinsics have extra operands and require separate codegen pattern.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f2ed17ac305a1..26bf6d6ff7b89 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1253,11 +1253,12 @@ class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> {
   bit IsMAI = MAI;
 }
 
-def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>;
-def VOP3_CLAMP   : VOP3Features<1, 0, 0, 0>;
-def VOP3_OPSEL   : VOP3Features<1, 1, 0, 0>;
-def VOP3_PACKED  : VOP3Features<1, 1, 1, 0>;
-def VOP3_MAI     : VOP3Features<0, 0, 0, 1>;
+def VOP3_REGULAR         : VOP3Features<0, 0, 0, 0>;
+def VOP3_CLAMP           : VOP3Features<1, 0, 0, 0>;
+def VOP3_OPSEL           : VOP3Features<1, 1, 0, 0>;
+def VOP3_PACKED          : VOP3Features<1, 1, 1, 0>;
+def VOP3_PACKED_NO_OPSEL : VOP3Features<1, 0, 1, 0>;
+def VOP3_MAI             : VOP3Features<0, 0, 0, 1>;
 
 class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
 
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
index 8be7308c8a6e0..3feccff715bc1 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
@@ -746,7 +746,7 @@ name:            smfma4x4_write_vgpr_dot_write
 body:             |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: smfma4x4_read_srcc_vgpr_valu_write
 # GCN:      V_MFMA
@@ -945,7 +945,7 @@ name:            dot_write_vgpr_different_dot_read_srcc
 body:             |
   bb.0:
     $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
-    $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: dot_write_vgpr_different_dot_write
 # GCN:      V_DOT
@@ -955,7 +955,7 @@ name:            dot_write_vgpr_different_dot_write
 body:             |
   bb.0:
     $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
-    $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: dot_write_vgpr_different_valu_read
 # GCN:      V_DOT
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index a98b02d792d98..9fb6a7b6b5e55 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -1014,7 +1014,7 @@ name:            xdl_smfma4x4_write_vgpr_dot_write
 body:             |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: nonxdl_smfma4x4_read_srcc_vgpr_valu_write
 # GCN:      V_MFMA
@@ -1208,7 +1208,7 @@ name:            dot_write_vgpr_different_dot_read_srcc
 body:             |
   bb.0:
     $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
-    $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: dot_write_vgpr_different_dot_write
 # GCN:      V_DOT
@@ -1218,7 +1218,7 @@ name:            dot_write_vgpr_different_dot_write
 body:             |
   bb.0:
     $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
-    $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
 ...
 # GCN-LABEL: name: dot_write_vgpr_different_valu_read
 # GCN:      V_DOT
diff --git a/llvm/test/MC/AMDGPU/dl-insts.s b/llvm/test/MC/AMDGPU/dl-insts.s
index 00e9bec7eb0a2..599734aac829d 100644
--- a/llvm/test/MC/AMDGPU/dl-insts.s
+++ b/llvm/test/MC/AMDGPU/dl-insts.s
@@ -536,198 +536,6 @@ v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
 v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
 // CHECK: encoding: [0x00,0x18,0xa7,0xd3,0x01,0x05,0x0e,0x1c]
 v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
 
 //
 // Test clamp.
diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s
index 51498d3c86d7f..87a09875f75e9 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_err.s
@@ -211,3 +211,365 @@ image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20], s[12
 // missing dim
 image_msaa_load v[1:4], v[5:7], s[8:15] dmask:0xf glc
 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+// op_sel not allowed in dot opcodes with 4- or 8-bit packed data
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i...
[truncated]

arsenm

I believe this is only true for gfx940, and I think the older targets did support op_sel with these. I see a note that not supporting op_sel is a change, but don't see documentation for what it did before

DadSchoorse · 2024-07-25T17:45:20Z

but don't see documentation for what it did before

op_sel/op_sel_hi on src0/1 works the same as for other packed instructions on gfx10-11, it selects which half of the source is read. src2 op_sel is ignored.

jwanggit86 · 2024-07-25T18:07:44Z

BTW, this is for issue 44257.

jwanggit86 · 2024-08-02T21:54:18Z

and I think the older targets did support op_sel with these.

These instructions (v_dot4, v_dot8) have fixed ways of using the 4-bit/8-bit data, so op_sel isn't really meaningful?

jwanggit86 · 2024-08-12T21:48:54Z

I believe this is only true for gfx940, and I think the older targets did support op_sel with these. I see a note that not supporting op_sel is a change, but don't see documentation for what it did before

@arsenm
Is one of the following the note you referred to above?

AMDGPUInstructionSelector::selectVOP3PModsImpl(
  Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
  ...
  (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
}

class GCNSubtarget {
...
  // Cannot use op_sel with v_dot instructions.
  bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
}

Even these two places are inconsistent. One says "do not use on gfx940+", another seems to apply only to gfx940 and gfx11.

Second, are you suggesting op_sel should be disallowed for gfx940+, but left alone for older targets?

arsenm · 2024-08-16T18:35:03Z

I believe this is only true for gfx940, and I think the older targets did support op_sel with these. I see a note that not supporting op_sel is a change, but don't see documentation for what it did before

@arsenm Is one of the following the note you referred to above?

No, I'm reading the ISA documentation, not the implementation

Even these two places are inconsistent. One says "do not use on gfx940+", another seems to apply only to gfx940 and gfx11.

This looks like it wasn't cleanly implemented to track which targets support it and which don't. I wouldn't trust whatever is implemented here, this requires double checking the hardware docs

Second, are you suggesting op_sel should be disallowed for gfx940+, but left alone for older targets?

It should be left alone on any target where it's supported. We shouldn't just drop it everywhere

jwanggit86 · 2024-08-23T18:20:32Z

I believe this is only true for gfx940, and I think the older targets did support op_sel with these. I see a note that not supporting op_sel is a change, but don't see documentation for what it did before

@arsenm Is one of the following the note you referred to above?

No, I'm reading the ISA documentation, not the implementation

Even these two places are inconsistent. One says "do not use on gfx940+", another seems to apply only to gfx940 and gfx11.

This looks like it wasn't cleanly implemented to track which targets support it and which don't. I wouldn't trust whatever is implemented here, this requires double checking the hardware docs

Second, are you suggesting op_sel should be disallowed for gfx940+, but left alone for older targets?

It should be left alone on any target where it's supported. We shouldn't just drop it everywhere

@arsenm I checked the documentation. For gfx940 it is explicitly stated that op_sel is not allowed. However, it is vague (i.e., not explicitly said) for other targets. So, should we just disallow it for gfx940 for now?

jwanggit86 · 2024-09-04T18:09:45Z

@arsenm ping.

Sisyph · 2024-10-03T18:30:06Z

Sp3 does not support op_sel for dot4* or dot8* instructions on any of gfxip 908,90a,940,1030,11, or 12.
None of those ISA manuals say the instructions should support op_sel.
I don't see any docs on what the semantics of opsel on those instructions would be.
The only mention of opsel on these instructions in the programming guide is in GFX11 and GFX12. It says if you use inline constants they 'use 32bit inline src0/1 (ignore OPSEL)'. We use i32 fields for those operands in tablegen, so that looks fine as is.
We don't use op_sel in codegen on those instructions. So this will not be a functional change to codegen, only the asm/disasm (please correct me if I'm wrong on this).

However, @DadSchoorse reports some semantics on how op_sel might work on these instructions. Is that claim from experimentation on hardware, or something else?

It might be best to leave the assembler/disassembler as is, and make sure we continue to not use op_sel in codegen. If we do want to change these, please change them across all targets.

arsenm · 2024-10-03T19:26:47Z

I don't see any docs on what the semantics of opsel on those instructions would be.

I don't either. I only found a note that says "is no longer supported on DOT2/4/8" for 940. That implies it was previously supported

jwanggit86 · 2024-10-07T19:09:51Z

Here is a summary of my findings from the ISA docs:

v_dot4_, v_dot8_ use VOP3P encoding.
MI100 ISA: Sect 12.10 "VOP3P Instructions", only 3 V_MAD_MIX* instructions use OPSEL.
MI200 ISA: same as MI100.
MI300 ISA: same as MI200.
RDNA1 ISA: No v_dot4_, v_dot8_; 3 V_FMA_MIX* instructions use OPSEL.
RDNA2 ISA: 3 V_FMA_MIX* instructions use OPSEL.
RDNA3 ISA: (1) Sect 16.10 "VOP3P Instructions", 3 V_FMA_MIX* instructions use OPSEL.
(2) Sect 7.5.1 "Inline Constants with Packed Math", mentions "ignore OPSEL" for dot4_* and dot8_*. So apparently OPSEL is allowed.

Based on this I'm ok with either of the following: (1) leave assembler/disassembler as is (2) disallow OPSEL for v_dot4_* and v_dot8_* for pre-GFX11 targets, as this patch does.

@arsenm @Sisyph Please let me know your thoughts.

DadSchoorse · 2024-10-07T19:48:07Z

However, @DadSchoorse reports some semantics on how op_sel might work on these instructions. Is that claim from experimentation on hardware, or something else?

From experimentation. For example on gfx10.3 and gfx11 hw behavior of v_dot4_u32_u8 is:

uint32_t v0x = (uint8_t)(src0 >> (opsel[0] * 16));
uint32_t v0y = (uint8_t)((src0 >> (opsel[0] * 16)) >>  8);
uint32_t v0z = (uint8_t)(src0 >> (opsel_hi[0] * 16));
uint32_t v0w = (uint8_t)((src0 >> (opsel_hi[0] * 16)) >> 8);
uint32_t v1x = (uint8_t)(src1 >> (opsel[1] * 16));
uint32_t v1y = (uint8_t)((src1 >> (opsel[1] * 16)) >>  8);
uint32_t v1z = (uint8_t)(src1 >> (opsel_hi[1] * 16));
uint32_t v1w = (uint8_t)((src1 >> (opsel_hi[1] * 16)) >> 8);
dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

src2 opsel is ignored, src0/1 opsel works as a 16bit swizzle. (I have no gfx9 hw to test)

The RDNA3 doc note that opsel is ignored for inline constants is not correct:

v_mov_b32 v2 0x04030201
v_dot4_u32_u8 v0, 1, v2, 0 op_sel:[0,0,0] op_sel_hi:[1,1,1]
// v0 is 1
v_dot4_u32_u8 v0, 1, v2, 0 op_sel:[1,0,0] op_sel_hi:[1,1,1]
// v0 is 0
v_dot4_u32_u8 v0, 1, v2, 0 op_sel:[0,0,0] op_sel_hi:[0,1,1]
// v0 is 4
v_dot4_u32_u8 v0, 1, v2, 0 op_sel:[0,1,0] op_sel_hi:[1,1,1]
// v0 is 6

jwanggit86 · 2024-10-11T01:45:50Z

Here's some additional info garnered from the ISA docs.

In RDNA3 doc, Sect 7.1 "Microcode Encodings" there's a statement: "OPSEL may only be used for 16-bit operands, and must be zero for any other operands/results." Since v_dot4_* and v_dot8_* don't operate on 16-bit operands, I think OPSEL is allowed but must be 0.

In RDNA2 doc, Sect 6.2.5 "OPSEL Field Restrictions" states OPSEL is only usable in opcodes listed in Table 21, which doesn't include v_dot4_* or v_dot8_*.

In MI300 doc, Sect 6.2.1 "Instruction Inputs" has a statement: "DOT instructions must not use SDWA or OPSEL.".

In MI200 doc, there are no explicit statements about OPSEL in DOT instructions.

Based on the above, I'd suggest that for v_dot4_* and v_dot8_*, (1) OPSEL should be disallowed for MI300 and RDNA2 (2) OPSEL is allowed but must be 0 for RDNA3.

@arsenm @Sisyph @DadSchoorse Pls let me know your thoughts.

arsenm · 2024-10-11T16:52:18Z

We should run this by hardware. If I were to guess, this is yet another case where they didn't validate it works so did not document it as supported.

rampitec · 2024-10-11T18:31:03Z

I am reading SP3 pseudocode, looks like OPSEL is not used on any of the targets with this. These instructions also do not have OPSEL flag in SP3. So I guess we need to set it to zero on all targets.

jwanggit86 · 2024-10-21T19:21:17Z

In the existing code, for all dot instructions for GFX940, op_sel must be 0, and op_sel_hi cannot be included. The latest commits apply the same restrictions to VOP3P dot instructions (v_dot4_*, v_dot8_*, and some v_dot2_*).

jwanggit86 · 2024-10-30T17:43:18Z

@arsenm @rampitec ping.

jwanggit86 · 2024-11-06T22:16:20Z

@arsenm @rampitec ping.

rampitec · 2024-11-07T18:39:11Z

llvm/test/MC/AMDGPU/dl-insts.s

 //

 // CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x1c]
 v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xa3,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,1]


This instruction supports opsel before gfx940.

Thanks! However, v_dot4_ and v_dot8_ still don't support opsel pre-gfx940, right? What's the best way to differentiate between v_dot2_ and, say v_dot4_?

Right, dot4 and dot8 do not have opsel. The difference is that these work on 4 and 8 component packed data and opsel can only select a word. Technically we are using V2F16 for v_dot2_f32_f16 in the profile and I32 for dot4 and dot8.

rampitec · 2024-11-08T20:54:14Z

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

-        return false;
+  if (TSFlags & SIInstrFlags::IsDOT) {
+    // For all DOT instructions on GFX940, or VOP3P DOT instructions on all
+    // targets, i.e. v_dot2_*(except on pre-GFX940), v_dot4_* and v_dot8_*,


I have checked gfx10 and DOT2 supports opsel, at least for 3 instructions below...

Ok, I'll look into this.

rampitec · 2024-11-08T21:36:08Z

Besides asm parser there is also codegen. Can we just select VOP3Features for dot4 and dot8 instructions to include VOP3_PACKED, but not VOP3_OPSEL? So that op_sel operands will not even exist on these. I guess it will need some encoding bits to be hardcoded in their Real definitions.

jwanggit86 · 2024-11-08T23:19:41Z

Besides asm parser there is also codegen. Can we just select VOP3Features for dot4 and dot8 instructions to include VOP3_PACKED, but not VOP3_OPSEL? So that op_sel operands will not even exist on these. I guess it will need some encoding bits to be hardcoded in their Real definitions.

The very 1st commit of the patch did just that. Then op_sel is disallowed for v_dot4_* and v_dot8_*. The 2nd commit then allows it for GFX11 and GFX12. The latest version allows it, but it must be 0. Should I go back to the 2nd commit?

jwanggit86 · 2024-12-11T19:26:31Z

@Sisyph @arsenm @rampitec So far two solutions have been tried: (1) disallow OPSEL for v_dot4_* and v_dot8_* for pre-GFX11 targets (2) allow OPSEL for all targets but its value must be 0. A 3rd option is not to make any fix and leave things as they are. Pls let me know your thoughts.

rampitec · 2024-12-11T20:47:19Z

@Sisyph @arsenm @rampitec So far two solutions have been tried: (1) disallow OPSEL for v_dot4_* and v_dot8_* for pre-GFX11 targets (2) allow OPSEL for all targets but its value must be 0. A 3rd option is not to make any fix and leave things as they are. Pls let me know your thoughts.

I would disallow it. Why pre-GFX11 only? It does not do anything for dot4 and dot8 on any target.

jwanggit86 · 2024-12-11T22:46:37Z

I would disallow it. Why pre-GFX11 only? It does not do anything for dot4 and dot8 on any target.

The reason for limiting this to pre-GFX11 is that Sect 7.5.1 of the RDNA3 ISA doc indicates "ignore OPSEL" for dot4_* and dot8_*, which seems to imply it's allowed. But I'm fine with disallowing it for all targets, which is actually what the 1st commit did.

rampitec · 2024-12-11T22:48:41Z

I would disallow it. Why pre-GFX11 only? It does not do anything for dot4 and dot8 on any target.

The reason for limiting this to pre-GFX11 is that Sect 7.5.1 of the RDNA3 ISA doc indicates "ignore OPSEL" for dot4_* and dot8_*, which seems to imply it's allowed. But I'm fine with disallowing it for all targets, which is actually what the 1st commit did.

Technically you can encode it, but it does not affect the execution. No reason to handle it differently between targets, just set encoding bits to default.

packed data In v_dot4 and v_dot8 instructions with 4- or 8-bit packed data (e.g., v_dot4_u32_u8, v_dot8_u32_u4), the op_sel modifier should not be allowed for GFX9 and GFX10. For GFX11 and GFX12, section 7.5.1 of the RDNA3 ISA doc seems to indicate that OPSEL is allowed.

Section 7.5.1 of the RDNA3 ISA doc apparently indicates that OPSEL is allowed (although ignored) in the dot instructions in question. Therefore, we should allow it for GFX11 and GFX12.

be 0 if present.

behavior, i.e., op_sel should be 0 for all DOT instructions, not just the VOP3P DOTs.

OPSEL in v_dot4_* and v_dot8_* instructions for all targets. This reverts commit 7ae70f0 and all later commits.

arsenm

This matches what the documentation states. If we really want to support undocumented usage, we should make a more conscious effort to test and validate it really works

rampitec

VOP3_PACKED_NO_OPSEL is the right thing to do.

reverts: breaks rocBLAS build d57230c [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (llvm#100485) Change-Id: I19e8a59504e111ac0eeca51f30a0fbe36c6b57d5

jwanggit86 added backend:AMDGPU mc Machine (object) code labels Jul 24, 2024

jwanggit86 requested a review from arsenm July 24, 2024 23:58

arsenm reviewed Jul 25, 2024

View reviewed changes

jwanggit86 force-pushed the disallow-opsel-in-dot-op-with-4-or-8-bit-packed-data branch 2 times, most recently from dff788f to 297b09b Compare October 2, 2024 20:26

jayfoad requested review from Sisyph and petar-avramovic October 3, 2024 08:13

arsenm requested review from jayfoad and rampitec October 11, 2024 16:52

jwanggit86 changed the title ~~[AMDGPU][MC] Disallow op_sel in v_dot4 and v_dot8 with 4- or 8-bit packed data~~ [AMDGPU][MC] Restrict op_sel in VOP3P dot instructions Oct 19, 2024

jwanggit86 requested a review from arsenm October 21, 2024 19:23

jwanggit86 force-pushed the disallow-opsel-in-dot-op-with-4-or-8-bit-packed-data branch from d2918ab to f7e9331 Compare October 24, 2024 23:35

rampitec reviewed Nov 7, 2024

View reviewed changes

rampitec reviewed Nov 8, 2024

View reviewed changes

jwanggit86 requested a review from rampitec November 11, 2024 18:16

jwanggit86 added 6 commits December 17, 2024 20:29

Allow OPSEL for GFX11 and GFX12.

8893ea2

Section 7.5.1 of the RDNA3 ISA doc apparently indicates that OPSEL is allowed (although ignored) in the dot instructions in question. Therefore, we should allow it for GFX11 and GFX12.

Allow op_sel for VOP3P DOT instructions, but value of op_sel must

f5d32b2

be 0 if present.

Fix a problem in previous commit. For GFX940 keep the original

82f0f7c

behavior, i.e., op_sel should be 0 for all DOT instructions, not just the VOP3P DOTs.

For pre-GFX940 targets, v_dot2_* allow opsel.

f1728c7

Revert to the 1st commit, 2753fd1, which disallows

dfde973

OPSEL in v_dot4_* and v_dot8_* instructions for all targets. This reverts commit 7ae70f0 and all later commits.

jwanggit86 force-pushed the disallow-opsel-in-dot-op-with-4-or-8-bit-packed-data branch from 4fc4ac6 to dfde973 Compare December 18, 2024 05:26

arsenm approved these changes Dec 18, 2024

View reviewed changes

rampitec approved these changes Dec 18, 2024

View reviewed changes

Sisyph approved these changes Dec 18, 2024

View reviewed changes

jwanggit86 changed the title ~~[AMDGPU][MC] Restrict op_sel in VOP3P dot instructions~~ [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions Dec 18, 2024

jwanggit86 merged commit d57230c into llvm:main Dec 18, 2024
8 checks passed

jwanggit86 linked an issue Dec 19, 2024 that may be closed by this pull request

[AMDGPU][MC][GFX908] dot opcodes operating 4-bit and 8-bit packed data should not support op_sel #44257

Closed

This was referenced Jun 2, 2025

[MTE] [NFC] use vector to collect globals to tag (#120283) #142329

Closed

[MTE] [NFC] use vector to collect globals to tag (#120283) #142330

Draft

[AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions #100485

[AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions #100485

Uh oh!

Conversation

jwanggit86 commented Jul 24, 2024

Uh oh!

llvmbot commented Jul 24, 2024

Uh oh!

llvmbot commented Jul 24, 2024

Uh oh!

arsenm left a comment • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

DadSchoorse commented Jul 25, 2024

Uh oh!

jwanggit86 commented Jul 25, 2024

Uh oh!

jwanggit86 commented Aug 2, 2024

Uh oh!

jwanggit86 commented Aug 12, 2024

Uh oh!

arsenm commented Aug 16, 2024

Uh oh!

jwanggit86 commented Aug 23, 2024

Uh oh!

jwanggit86 commented Sep 4, 2024

Uh oh!

Sisyph commented Oct 3, 2024

Uh oh!

arsenm commented Oct 3, 2024

Uh oh!

jwanggit86 commented Oct 7, 2024

Uh oh!

DadSchoorse commented Oct 7, 2024

Uh oh!

jwanggit86 commented Oct 11, 2024

Uh oh!

arsenm commented Oct 11, 2024

Uh oh!

rampitec commented Oct 11, 2024

Uh oh!

jwanggit86 commented Oct 21, 2024

Uh oh!

jwanggit86 commented Oct 30, 2024

Uh oh!

jwanggit86 commented Nov 6, 2024

Uh oh!

rampitec Nov 7, 2024

Choose a reason for hiding this comment

Uh oh!

jwanggit86 Nov 8, 2024

Choose a reason for hiding this comment

Uh oh!

rampitec Nov 8, 2024

Choose a reason for hiding this comment

Uh oh!

rampitec Nov 8, 2024

Choose a reason for hiding this comment

Uh oh!

jwanggit86 Nov 8, 2024

Choose a reason for hiding this comment

Uh oh!

rampitec commented Nov 8, 2024

Uh oh!

jwanggit86 commented Nov 8, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jwanggit86 commented Dec 11, 2024

Uh oh!

rampitec commented Dec 11, 2024

Uh oh!

jwanggit86 commented Dec 11, 2024

Uh oh!

rampitec commented Dec 11, 2024

Uh oh!

arsenm left a comment

Choose a reason for hiding this comment

Uh oh!

rampitec left a comment

arsenm left a comment •

edited

Loading

jwanggit86 commented Nov 8, 2024 •

edited

Loading