-
Notifications
You must be signed in to change notification settings - Fork 14.1k
[AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions #100485
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions #100485
Conversation
@llvm/pr-subscribers-mc Author: Jun Wang (jwanggit86) ChangesIn v_dot4 and v_dot8 instructions with 4- or 8-bit packed data (e.g., v_dot4_u32_u8, v_dot8_u32_u4), the op_sel modifier should not be included. Patch is 100.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100485.diff 11 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index f4d2c29158f49..942f1dc251766 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -384,19 +384,19 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
let OtherPredicates = [HasDot7Insts] in {
let IsInvalidSingleUseConsumer = 1 in {
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_udot4, 1>;
}
defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_udot8, 1>;
} // End OtherPredicates = [HasDot7Insts]
let OtherPredicates = [HasDot1Insts] in {
let IsInvalidSingleUseConsumer = 1 in {
defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_sdot4, 1>;
}
defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_sdot8, 1>;
} // End OtherPredicates = [HasDot1Insts]
def DOT2_BF16_Profile
@@ -415,7 +415,7 @@ defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", DOT2_BF16_Profile,
multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
let IsDOT = 1 in
- defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>,
+ defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>,
null_frag, 1>;
// Dot-iu instructions consider input as signed if imod neg bits are set. Thus
// Dot-iu Intrinsics have extra operands and require separate codegen pattern.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f2ed17ac305a1..26bf6d6ff7b89 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1253,11 +1253,12 @@ class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> {
bit IsMAI = MAI;
}
-def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>;
-def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
-def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
-def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
-def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
+def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>;
+def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
+def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
+def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
+def VOP3_PACKED_NO_OPSEL : VOP3Features<1, 0, 1, 0>;
+def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
index 8be7308c8a6e0..3feccff715bc1 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
@@ -746,7 +746,7 @@ name: smfma4x4_write_vgpr_dot_write
body: |
bb.0:
$vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
- $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: smfma4x4_read_srcc_vgpr_valu_write
# GCN: V_MFMA
@@ -945,7 +945,7 @@ name: dot_write_vgpr_different_dot_read_srcc
body: |
bb.0:
$vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
- $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dot_write_vgpr_different_dot_write
# GCN: V_DOT
@@ -955,7 +955,7 @@ name: dot_write_vgpr_different_dot_write
body: |
bb.0:
$vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
- $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dot_write_vgpr_different_valu_read
# GCN: V_DOT
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index a98b02d792d98..9fb6a7b6b5e55 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -1014,7 +1014,7 @@ name: xdl_smfma4x4_write_vgpr_dot_write
body: |
bb.0:
$vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
- $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: nonxdl_smfma4x4_read_srcc_vgpr_valu_write
# GCN: V_MFMA
@@ -1208,7 +1208,7 @@ name: dot_write_vgpr_different_dot_read_srcc
body: |
bb.0:
$vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
- $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dot_write_vgpr_different_dot_write
# GCN: V_DOT
@@ -1218,7 +1218,7 @@ name: dot_write_vgpr_different_dot_write
body: |
bb.0:
$vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
- $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dot_write_vgpr_different_valu_read
# GCN: V_DOT
diff --git a/llvm/test/MC/AMDGPU/dl-insts.s b/llvm/test/MC/AMDGPU/dl-insts.s
index 00e9bec7eb0a2..599734aac829d 100644
--- a/llvm/test/MC/AMDGPU/dl-insts.s
+++ b/llvm/test/MC/AMDGPU/dl-insts.s
@@ -536,198 +536,6 @@ v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
// CHECK: encoding: [0x00,0x18,0xa7,0xd3,0x01,0x05,0x0e,0x1c]
v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
//
// Test clamp.
diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s
index 51498d3c86d7f..87a09875f75e9 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_err.s
@@ -211,3 +211,365 @@ image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20], s[12
// missing dim
image_msaa_load v[1:4], v[5:7], s[8:15] dmask:0xf glc
// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+// op_sel not allowed in dot opcodes with 4- or 8-bit packed data
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: Jun Wang (jwanggit86) ChangesIn v_dot4 and v_dot8 instructions with 4- or 8-bit packed data (e.g., v_dot4_u32_u8, v_dot8_u32_u4), the op_sel modifier should not be included. Patch is 100.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100485.diff 11 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index f4d2c29158f49..942f1dc251766 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -384,19 +384,19 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
let OtherPredicates = [HasDot7Insts] in {
let IsInvalidSingleUseConsumer = 1 in {
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_udot4, 1>;
}
defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_udot8, 1>;
} // End OtherPredicates = [HasDot7Insts]
let OtherPredicates = [HasDot1Insts] in {
let IsInvalidSingleUseConsumer = 1 in {
defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_sdot4, 1>;
}
defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>, int_amdgcn_sdot8, 1>;
} // End OtherPredicates = [HasDot1Insts]
def DOT2_BF16_Profile
@@ -415,7 +415,7 @@ defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", DOT2_BF16_Profile,
multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
let IsDOT = 1 in
- defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>,
+ defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>,
null_frag, 1>;
// Dot-iu instructions consider input as signed if imod neg bits are set. Thus
// Dot-iu Intrinsics have extra operands and require separate codegen pattern.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f2ed17ac305a1..26bf6d6ff7b89 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1253,11 +1253,12 @@ class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> {
bit IsMAI = MAI;
}
-def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>;
-def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
-def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
-def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
-def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
+def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>;
+def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
+def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
+def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
+def VOP3_PACKED_NO_OPSEL : VOP3Features<1, 0, 1, 0>;
+def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
index 8be7308c8a6e0..3feccff715bc1 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
@@ -746,7 +746,7 @@ name: smfma4x4_write_vgpr_dot_write
body: |
bb.0:
$vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
- $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: smfma4x4_read_srcc_vgpr_valu_write
# GCN: V_MFMA
@@ -945,7 +945,7 @@ name: dot_write_vgpr_different_dot_read_srcc
body: |
bb.0:
$vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
- $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dot_write_vgpr_different_dot_write
# GCN: V_DOT
@@ -955,7 +955,7 @@ name: dot_write_vgpr_different_dot_write
body: |
bb.0:
$vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
- $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dot_write_vgpr_different_valu_read
# GCN: V_DOT
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
index a98b02d792d98..9fb6a7b6b5e55 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -1014,7 +1014,7 @@ name: xdl_smfma4x4_write_vgpr_dot_write
body: |
bb.0:
$vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
- $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: nonxdl_smfma4x4_read_srcc_vgpr_valu_write
# GCN: V_MFMA
@@ -1208,7 +1208,7 @@ name: dot_write_vgpr_different_dot_read_srcc
body: |
bb.0:
$vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
- $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dot_write_vgpr_different_dot_write
# GCN: V_DOT
@@ -1218,7 +1218,7 @@ name: dot_write_vgpr_different_dot_write
body: |
bb.0:
$vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
- $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dot_write_vgpr_different_valu_read
# GCN: V_DOT
diff --git a/llvm/test/MC/AMDGPU/dl-insts.s b/llvm/test/MC/AMDGPU/dl-insts.s
index 00e9bec7eb0a2..599734aac829d 100644
--- a/llvm/test/MC/AMDGPU/dl-insts.s
+++ b/llvm/test/MC/AMDGPU/dl-insts.s
@@ -536,198 +536,6 @@ v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
// CHECK: encoding: [0x00,0x18,0xa7,0xd3,0x01,0x05,0x0e,0x1c]
v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xa8,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x04]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x14]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xa9,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xaa,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x40,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0]
-// CHECK: encoding: [0x00,0x50,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1]
-// CHECK: encoding: [0x00,0x48,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0]
-// CHECK: encoding: [0x00,0x58,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x10,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x08,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x04]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x14]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x0c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
-// CHECK: encoding: [0x00,0x18,0xab,0xd3,0x01,0x05,0x0e,0x1c]
-v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
//
// Test clamp.
diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s
index 51498d3c86d7f..87a09875f75e9 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_err.s
@@ -211,3 +211,365 @@ image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20], s[12
// missing dim
image_msaa_load v[1:4], v[5:7], s[8:15] dmask:0xf glc
// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+// op_sel not allowed in dot opcodes with 4- or 8-bit packed data
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe this is only true for gfx940, and I think the older targets did support op_sel with these. I see a note that not supporting op_sel is a change, but don't see documentation for what it did before
op_sel/op_sel_hi on src0/1 works the same as for other packed instructions on gfx10-11, it selects which half of the source is read. src2 op_sel is ignored. |
BTW, this is for issue 44257. |
These instructions (v_dot4, v_dot8) have fixed ways of using the 4-bit/8-bit data, so op_sel isn't really meaningful? |
@arsenm AMDGPUInstructionSelector::selectVOP3PModsImpl(
Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
...
(void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
}
class GCNSubtarget {
...
// Cannot use op_sel with v_dot instructions.
bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
} Even these two places are inconsistent. One says "do not use on gfx940+", another seems to apply only to gfx940 and gfx11. Second, are you suggesting op_sel should be disallowed for gfx940+, but left alone for older targets? |
No, I'm reading the ISA documentation, not the implementation
This looks like it wasn't cleanly implemented to track which targets support it and which don't. I wouldn't trust whatever is implemented here, this requires double checking the hardware docs
It should be left alone on any target where it's supported. We shouldn't just drop it everywhere |
@arsenm I checked the documentation. For gfx940 it is explicitly stated that op_sel is not allowed. However, it is vague (i.e., not explicitly said) for other targets. So, should we just disallow it for gfx940 for now? |
@arsenm ping. |
dff788f
to
297b09b
Compare
Sp3 does not support op_sel for dot4* or dot8* instructions on any of gfxip 908,90a,940,1030,11, or 12. However, @DadSchoorse reports some semantics on how op_sel might work on these instructions. Is that claim from experimentation on hardware, or something else? It might be best to leave the assembler/disassembler as is, and make sure we continue to not use op_sel in codegen. If we do want to change these, please change them across all targets. |
I don't either. I only found a note that says "is no longer supported on DOT2/4/8" for 940. That implies it was previously supported |
Here is a summary of my findings from the ISA docs:
Based on this I'm ok with either of the following: (1) leave assembler/disassembler as is (2) disallow OPSEL for v_dot4_* and v_dot8_* for pre-GFX11 targets, as this patch does. |
From experimentation. For example on gfx10.3 and gfx11 hw behavior of
src2 opsel is ignored, src0/1 opsel works as a 16bit swizzle. (I have no gfx9 hw to test) The RDNA3 doc note that opsel is ignored for inline constants is not correct:
|
Here's some additional info garnered from the ISA docs. In RDNA3 doc, Sect 7.1 "Microcode Encodings" there's a statement: "OPSEL may only be used for 16-bit operands, and must be zero for any other operands/results." Since v_dot4_* and v_dot8_* don't operate on 16-bit operands, I think OPSEL is allowed but must be 0. In RDNA2 doc, Sect 6.2.5 "OPSEL Field Restrictions" states OPSEL is only usable in opcodes listed in Table 21, which doesn't include v_dot4_* or v_dot8_. In MI300 doc, Sect 6.2.1 "Instruction Inputs" has a statement: "DOT instructions must not use SDWA or OPSEL.". In MI200 doc, not explicit statements about OPSEL in Dot. Based on the above, I'd suggest that for v_dot4_* and v_dot8_*, (1) OPSEL should be disallowed for MI300 and RDNA2 (2) OPSEL is allowed but must be 0 for RDNA3. @arsenm @Sisyph @DadSchoorse Pls let me know your thoughts. |
We should run this by hardware. If I were to guess, this is yet another case where they didn't validate it works so did not document it as supported. |
I am reading SP3 pseudocode, looks like OPSEL is not used on any of the targets with this. These instructions also do not have OPSEL flag in SP3. So I guess we need to set it to zero on all targets. |
In the existing code, for all dot instructions for GFX940, op_sel must be 0, and op_sel_hi cannot be included. This latest commits apply the same restrictions to VOP3P dot instructions (v_dot4_, v_dot8_, and some v_dot2_*). |
d2918ab
to
f7e9331
Compare
1 similar comment
llvm/test/MC/AMDGPU/dl-insts.s
Outdated
// | ||
|
||
// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x1c] | ||
v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,0] | ||
// CHECK: encoding: [0x00,0x50,0xa3,0xd3,0x01,0x05,0x0e,0x1c] | ||
v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,1] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This instruction supports opsel before gfx940.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! However, v_dot4_
and v_dot8_
still don't support opsel pre-gfx940, right? What's the best way to differentiate between v_dot2_
and, say v_dot4_
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, dot4 and dot8 do not have opsel. The difference is that these work on 4 and 8 component packed data and opsel can only select a word. Technically we are using V2F16 for v_dot2_f32_f16 in the profile and I32 for dot4 and dot8.
return false; | ||
if (TSFlags & SIInstrFlags::IsDOT) { | ||
// For all DOT instructions on GFX940, or VOP3P DOT instructions on all | ||
// targets, i.e. v_dot2_*(except on pre-GFX940), v_dot4_* and v_dot8_*, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have checked gfx10 and DOT2 supports opsel, at least for 3 instructions below...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, I'll look into this.
Besides asm parser there is also codegen. Can we just select VOP3Features for dot4 and dot8 instructions to include VOP3_PACKED, but not VOP3_OPSEL? So that op_sel operands will not even exist on these. I guess it will need some encoding bits to be hardcoded in their Real definitions. |
The very 1st commit of the patch did just that. Then op_sel is disallowed for v_dot4_* and v_dot8_*. The 2nd commit then allows it for GFX11 and GFX12. The latest version allows it, but it must be 0. Should I go back to the 2nd commit? |
I would disallow it. Why pre-GFX11 only? It does not do anything for dot4 and dot8 on any target. |
The reason for limiting this to pre-GFX11 is that Sect 7.5.1 of the RDNA3 ISA doc indicates "ignore OPSEL" for dot4_* and dot8_*, which seems to imply it's allowed. But I'm fine with disallowing it for all targets, which is actually what the 1st commit did. |
Technically you can encode it, but it does not affect the execution. No reason to handle it differently between targets, just set encoding bits to default. |
packed data In v_dot4 and v_dot8 instructions with 4- or 8-bit packed data (e.g., v_dot4_u32_u8, v_dot8_u32_u4), the op_sel modifier should not be allowed for GFX9 and GFX10. For GFX11 and GFX12, section 7.5.1 of the RDNS3 ISA doc seems to indicate that OPSEL is allowed.
Section 7.5.1 of the RDNA3 ISA doc apparently indicates that OPSEL is allowed (although ignored) in the dot instructions in question. Therefore, we should allow it for GFX11 and GFX12.
be 0 if present.
behavior, i.e., op_sel should be 0 for all DOT instructions, not just the VOP3P DOTs.
OPSEL in v_dot4_* and v_dot8_* instructions for all targets. This reverts commit 7ae70f0 and all later commits.
4fc4ac6
to
dfde973
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This matches what the documentation states. If we really want to support undocumented usage, we should make a more conscious effort to test and validate it really works
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
VOP3_PACKED_NO_OPSEL is a right thing to do.
reverts: breaks rocBLAS build d57230c [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (llvm#100485) Change-Id: I19e8a59504e111ac0eeca51f30a0fbe36c6b57d5
In v_dot4 and v_dot8 instructions with 4- or 8-bit packed data (e.g., v_dot4_u32_u8, v_dot8_u32_u4), the op_sel modifier should not be included.