[AMDGPU] Add i1 mul patterns #67291
Conversation
@llvm/pr-subscribers-backend-amdgpu

Changes

i1 muls can sometimes happen after SCEV. They resulted in ISel failures because we were missing the patterns for them. Solves SWDEV-423354.

Patch is 27.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/67291.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 245808fc22a9c99..ed97137d5dedd53 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2345,6 +2345,11 @@ def : GCNPat <
(S_AND_B64 $src0, $src1)
>;
+def : GCNPat <
+ (i1 (mul i1:$src0, i1:$src1)),
+ (S_AND_B64 $src0, $src1)
+>;
+
def : GCNPat <
(i1 (or i1:$src0, i1:$src1)),
(S_OR_B64 $src0, $src1)
@@ -2384,6 +2389,11 @@ def : GCNPat <
(S_AND_B32 $src0, $src1)
>;
+def : GCNPat <
+ (i1 (mul i1:$src0, i1:$src1)),
+ (S_AND_B32 $src0, $src1)
+>;
+
def : GCNPat <
(i1 (or i1:$src0, i1:$src1)),
(S_OR_B32 $src0, $src1)
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index b4e9376d8277737..a8973d845b12dae 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -1059,6 +1059,298 @@ entry:
ret void
}
+define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind {
+; SI-LABEL: s_mul_i1:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s2, s[0:1], 0x13
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dword s3, s[0:1], 0x1c
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitcmp1_b32 s2, 0
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: s_bitcmp1_b32 s3, 0
+; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_mul_i1:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dword s3, s[0:1], 0x70
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitcmp1_b32 s2, 0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_bitcmp1_b32 s3, 0
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_mul_i1:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_bitcmp1_b32 s2, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_bitcmp1_b32 s3, 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_mul_i1:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c
+; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_bitcmp1_b32 s2, 0
+; GFX10-NEXT: s_cselect_b32 s0, -1, 0
+; GFX10-NEXT: s_bitcmp1_b32 s3, 0
+; GFX10-NEXT: s_cselect_b32 s1, -1, 0
+; GFX10-NEXT: s_and_b32 s0, s0, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_mul_i1:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
+; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitcmp1_b32 s2, 0
+; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_bitcmp1_b32 s3, 0
+; GFX11-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s2, s2, s3
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; EG-LABEL: s_mul_i1:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @10, KC0[], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PS, 1,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+entry:
+ %mul = mul i1 %a, %b
+ store i1 %mul, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; SI-LABEL: v_mul_i1:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
+; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v0, 1, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 1, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
+; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_mul_i1:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s10, s6
+; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s2
+; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_and_b32_e32 v0, 1, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v1, 1, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
+; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_mul_i1:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s10, s6
+; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
+; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_mul_i1:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
+; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
+; GFX10-NEXT: s_mov_b32 s1, s5
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
+; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: s_mov_b32 s0, s4
+; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_mul_i1:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; EG-LABEL: v_mul_i1:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T1.X, T0.X, 4, #1
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PS, 1,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+entry:
+ %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+ %a = load i1, ptr addrspace(1) %in
+ %b = load i1, ptr addrspace(1) %b_ptr
+ %result = mul i1 %a, %b
+ store i1 %result, ptr addrspace(1) %out
+ ret void
+}
+
; A standard 64-bit multiply. The expansion should be around 6 instructions.
; It would be difficult to match the expansion correctly without writing
; a really complicated list of FileCheck expressions. I don't want
@@ -1213,7 +1505,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; SI-NEXT: v_mul_hi_u32 v4, v2, v0
; SI-NEXT: v_mul_lo_u32 v3, v3, v0
; SI-NEXT: v_mul_lo_u32 v0, v2, v0
-; SI-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -1367,30 +1659,30 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 0
-; SI-NEXT: s_cbranch_scc0 .LBB11_2
+; SI-NEXT: s_cbranch_scc0 .LBB13_2
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mul_i32 s6, s2, s3
; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: s_branch .LBB11_3
-; SI-NEXT: .LBB11_2:
+; SI-NEXT: s_branch .LBB13_3
+; SI-NEXT: .LBB13_2:
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: .LBB11_3: ; %Flow
+; SI-NEXT: .LBB13_3: ; %Flow
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccnz .LBB11_5
+; SI-NEXT: s_cbranch_vccnz .LBB13_5
; SI-NEXT: ; %bb.4: ; %if
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; SI-NEXT: s_branch .LBB11_6
-; SI-NEXT: .LBB11_5:
+; SI-NEXT: s_branch .LBB13_6
+; SI-NEXT: .LBB13_5:
; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: .LBB11_6: ; %endif
+; SI-NEXT: .LBB13_6: ; %endif
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1402,18 +1694,18 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: s_cbranch_scc0 .LBB11_2
+; VI-NEXT: s_cbranch_scc0 .LBB13_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mul_i32 s6, s2, s3
; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: s_branch .LBB11_3
-; VI-NEXT: .LBB11_2:
+; VI-NEXT: s_branch .LBB13_3
+; VI-NEXT: .LBB13_2:
; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: ; implicit-def: $sgpr6
-; VI-NEXT: .LBB11_3: ; %Flow
+; VI-NEXT: .LBB13_3: ; %Flow
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT: s_cbranch_vccnz .LBB11_5
+; VI-NEXT: s_cbranch_vccnz .LBB13_5
; VI-NEXT: ; %bb.4: ; %if
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
@@ -1421,10 +1713,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; VI-NEXT: s_branch .LBB11_6
-; VI-NEXT: .LBB11_5:
+; VI-NEXT: s_branch .LBB13_6
+; VI-NEXT: .LBB13_5:
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: .LBB11_6: ; %endif
+; VI-NEXT: .LBB13_6: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
@@ -1437,18 +1729,18 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cbranch_scc0 .LBB11_2
+; GFX9-NEXT: s_cbranch_scc0 .LBB13_2
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: s_mul_i32 s6, s2, s3
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: s_branch .LBB11_3
-; GFX9-NEXT: .LBB11_2:
+; GFX9-NEXT: s_branch .LBB13_3
+; GFX9-NEXT: .LBB13_2:
; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: ; implicit-def: $sgpr6
-; GFX9-NEXT: .LBB11_3: ; %Flow
+; GFX9-NEXT: .LBB13_3: ; %Flow
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_vccnz .LBB11_5
+; GFX9-NEXT: s_cbranch_vccnz .LBB13_5
; GFX9-NEXT: ; %bb.4: ; %if
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
@@ -1456,10 +1748,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; GFX9-NEXT: s_branch .LBB11_6
-; GFX9-NEXT: .LBB11_5:
+; GFX9-NEXT: s_branch .LBB13_6
+; GFX9-NEXT: .LBB13_5:
; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: .LBB11_6: ; %endif
+; GFX9-NEXT: .LBB13_6: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -1473,17 +1765,17 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-NEXT: s_cbranch_scc0 .LBB11_2
+; GFX10-NEXT: s_cbranch_scc0 .LBB13_2
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_mul_i32 s5, s2, s3
-; GFX10-NEXT: s_branch .LBB11_3
-; GFX10-NEXT: .LBB11_2:
+; GFX10-NEXT: s_branch .LBB13_3
+; GFX10-NEXT: .LBB13_2:
; GFX10-NEXT: s_mov_b32 s4, -1
; GFX10-NEXT: ; implicit-def: $sgpr5
-; GFX10-NEXT: .LBB11_3: ; %Flow
+; GFX10-NEXT: .LBB13_3: ; %Flow
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_vccnz .LBB11_5
+; GFX10-NEXT: s_cbranch_vccnz .LBB13_5
; GFX10-NEXT: ; %bb.4: ; %if
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
@@ -1491,10 +1783,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: s_mov_b32 s5, s3
; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; GFX10-NEXT: s_branch .LBB11_6
-; GFX10-NEXT: .LBB11_5:
+; GFX10-NEXT: s_branch .LBB13_6
+; GFX10-NEXT: .LBB13_5:
; GFX10-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-NEXT: .LBB11_6: ; %endif
+; GFX10-NEXT: .LBB13_6: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
@@ -1508,17 +1800,17 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB11_2
+; GFX11-NEXT: s_cbranch_scc0 .LBB13_2
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_mul_i32 s5, s2, s3
-; GFX11-NEXT: s_branch .LBB11_3
-; GFX11-NEXT: .LBB11_2:
+; GFX11-NEXT: s_branch .LBB13_3
+; GFX11-NEXT: .LBB13_2:
; GFX11-NEXT: s_mov_b32 s4, -1
; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: .LBB11_3: ; %Flow
+; GFX11-NEXT: .LBB13_3: ; %Flow
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB11_5
+; GFX11-NEXT: s_cbranch_vccnz .LBB13_5
; GFX11-NEXT: ; %bb.4: ; %if
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -1526,10 +1818,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_mov_b32 s4, s2
; GFX11-NEXT: s_mov_b32 s5, s3
; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_branch .LBB11_6
-; GFX11-NEXT: .LBB11_5:
+; GFX11-NEXT: s_branch .LBB13_6
+; GFX11-NEXT: .LBB13_5:
; GFX11-NEXT: v_mov_b32_e32 v0, s5
-; GFX11-NEXT: .LBB11_6: ; %endif
+; GFX11-NEXT: .LBB13_6: ; %endif
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
@@ -1601,7 +1893,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:...
[truncated]
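For context on why a 1-bit multiply can be selected as a scalar AND (which is what the added GCNPats above do with S_AND_B64/S_AND_B32): with i1 operands the product is 1 only when both bits are 1, so mul and and compute the same value. The function below is an illustrative sketch in LLVM IR, not taken from the patch, and the function name is made up.

define i1 @mul_i1_is_and(i1 %a, i1 %b) {
  ; For 1-bit values, %a * %b equals %a & %b: 1 * 1 = 1 and every other
  ; combination is 0, so an i1 mul can safely be matched to a scalar AND.
  %m = mul i1 %a, %b
  ret i1 %m
}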
Would be better off legalizing these rather than selecting
We already have identical patterns for other operations like add, what makes … |
Ping |
I agree. Why are i1 add/mul legal in the first place? |
I legalized the i1 mul instead; it seems to lead to better codegen too. |
That would make sense to me, but I don't know the history. |
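To illustrate what legalizing the operation (rather than adding selection patterns) means here: the i1 mul is rewritten before instruction selection into something the backend already handles, so the selector never sees it at all. The rewrite happens on SelectionDAG nodes; the sketch below shows the equivalent effect at the IR level for readability. It is an assumption about the rewritten patch, whose exact legalization hook is not visible in the (pattern-based) diff above, and the function names are made up.

define i1 @before_legalization(i1 %a, i1 %b) {
  ; What instruction selection used to be handed: an i1 mul with no
  ; matching pattern, which caused the reported ISel failures.
  %mul = mul i1 %a, %b
  ret i1 %mul
}

define i1 @after_legalization(i1 %a, i1 %b) {
  ; What it sees once the i1 mul is legalized away: the equivalent and,
  ; for which S_AND_B32/S_AND_B64 patterns already exist.
  %mul = and i1 %a, %b
  ret i1 %mul
}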
LGTM.
Next time please remember to update the description when you rewrite the patch! |
Sorry, I completely forgot. Should I revert & reland with a better name? |
No problem, it happens. There is no need to revert the patch. |
i1 muls can sometimes happen after SCEV. They resulted in ISel failures because we were missing the patterns for them. Solves SWDEV-423354 Change-Id: I84457f4c2b2880d657738e91ae68871143f34a48
i1 muls can sometimes happen after SCEV. They resulted in ISel failures because we were missing the patterns for them.
Solves SWDEV-423354