
[AMDGPU] Enable atomic optimizer for divergent i64 and double values #96934


Merged: 12 commits, Jul 15, 2024
41 changes: 30 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
return Changed;
}

+static bool isLegalCrossLaneType(Type *Ty) {
+  switch (Ty->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    return true;
+  case Type::IntegerTyID: {
+    unsigned Size = Ty->getIntegerBitWidth();
+    return (Size == 32 || Size == 64);
+  }
+  default:
Contributor

Don't forget pointers. In a follow-up this should really just handle half / bfloat and vectors.

Contributor Author

I feel pointers should be handled as a follow-up too, since I intend this patch to reflect current requirements (I changed the title since it was misleading).

Contributor Author

Also, enabling half, bfloat, etc. would require additional legalization support for intrinsics such as update.dpp, set.inactive, ...

Contributor

I thought those all supported 16-bit values already

+    return false;
+  }
+}
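
To make the new predicate concrete: a few hypothetical atomicrmw instructions and how they fare, assuming a divergent value operand (illustrative IR, not taken from this patch's tests).

; Hypothetical IR, for illustration only.
define void @cross_lane_types(ptr addrspace(1) %p, i32 %v32, i64 %v64,
                              float %f32, double %f64, half %f16) {
  %a = atomicrmw add ptr addrspace(1) %p, i32 %v32 seq_cst     ; optimized (existing)
  %b = atomicrmw add ptr addrspace(1) %p, i64 %v64 seq_cst     ; optimized (new in this patch)
  %c = atomicrmw fadd ptr addrspace(1) %p, float %f32 seq_cst  ; optimized (existing)
  %d = atomicrmw fadd ptr addrspace(1) %p, double %f64 seq_cst ; optimized (new in this patch)
  %e = atomicrmw fadd ptr addrspace(1) %p, half %f16 seq_cst   ; still skipped: falls through default
  ret void
}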

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// Early exit for unhandled address space atomic instructions.
switch (I.getPointerAddressSpace()) {
@@ -228,11 +242,14 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
-  // we have DPP available on our subtarget, and the atomic operation is 32
-  // bits.
-  if (ValDivergent &&
-      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
-    return;
+  // we have DPP available on our subtarget (for DPP strategy), and the atomic
+  // operation is 32 or 64 bits.
+  if (ValDivergent) {
+    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+      return;
+
+    if (!isLegalCrossLaneType(I.getType()))
+      return;
  }

// If we get here, we can optimize the atomic using a single wavefront-wide
@@ -311,11 +328,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
-  // we have DPP available on our subtarget, and the atomic operation is 32
-  // bits.
-  if (ValDivergent &&
-      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
-    return;
+  // we have DPP available on our subtarget (for DPP strategy), and the atomic
+  // operation is 32 or 64 bits.
+  if (ValDivergent) {
+    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+      return;
+
+    if (!isLegalCrossLaneType(I.getType()))
+      return;
  }
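
The same two checks now gate the buffer-atomic intrinsics handled here. As a hypothetical example, a divergent 64-bit raw-buffer add, which previously made the optimizer bail out (intrinsic name and signature assumed from the overloaded raw.ptr.buffer.atomic family):

; Hypothetical IR, for illustration only.
declare i64 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i64(i64, ptr addrspace(8), i32, i32, i32 immarg)

define i64 @buffer_add_i64(ptr addrspace(8) %rsrc, i64 %v) {
  ; With divergent %v, this is now scanned like the 32-bit form.
  %old = call i64 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i64(i64 %v, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
  ret i64 %old
}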

// If any of the other arguments to the intrinsic are divergent, we can't
@@ -748,7 +768,6 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-        assert(TyBitWidth == 32);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
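
The bit-width assert above could be dropped because the call already passes Ty to CreateIntrinsic, so the readlane intrinsic is mangled for the payload type. Assuming the type-polymorphic lane intrinsics, the 64-bit reduction tail looks roughly like:

; Hypothetical IR, for illustration only (wave64, hence lane 63).
declare i64 @llvm.amdgcn.readlane.i64(i64, i32)

define i64 @read_last_lane(i64 %partial) {
  ; The last lane holds the accumulated value of all active lanes.
  %total = call i64 @llvm.amdgcn.readlane.i64(i64 %partial, i32 63)
  ret i64 %total
}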
1,374 changes: 1,186 additions & 188 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll

Large diffs are not rendered by default.

115 changes: 104 additions & 11 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -574,13 +574,44 @@ entry:
define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: add_i32_varying_vdata:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX6-NEXT: s_mov_b64 s[0:1], exec
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: ; implicit-def: $vgpr1
+; GFX6-NEXT: .LBB2_1: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1]
+; GFX6-NEXT: s_mov_b32 m0, s5
+; GFX6-NEXT: v_readlane_b32 s8, v0, s5
+; GFX6-NEXT: v_writelane_b32 v1, s4, m0
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_add_i32 s4, s4, s8
+; GFX6-NEXT: s_cbranch_vccnz .LBB2_1
+; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX6-NEXT: s_cbranch_execz .LBB2_4
+; GFX6-NEXT: ; %bb.3:
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX6-NEXT: .LBB2_4:
+; GFX6-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
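The ComputeLoop/ComputeEnd shape above is the Iterative strategy: GFX6 has no DPP, so before this patch the optimizer bailed out entirely, and under the relaxed gating it now emits a lane-by-lane scan. A simplified, hand-written IR sketch of that scan follows (the pass builds the real thing with IRBuilder; intrinsic names and manglings are assumed here):

; Simplified sketch of the Iterative scan, not the pass's verbatim output.
declare i64 @llvm.amdgcn.ballot.i64(i1)
declare i64 @llvm.cttz.i64(i64, i1)
declare i32 @llvm.amdgcn.readlane.i32(i32, i32)
declare i32 @llvm.amdgcn.writelane.i32(i32, i32, i32)

define i32 @iterative_scan(i32 %v) {
entry:
  %live = call i64 @llvm.amdgcn.ballot.i64(i1 true)          ; snapshot of exec
  br label %ComputeLoop
ComputeLoop:
  %active = phi i64 [ %live, %entry ], [ %active.next, %ComputeLoop ]
  %accum  = phi i32 [ 0, %entry ], [ %accum.next, %ComputeLoop ]
  %scan   = phi i32 [ poison, %entry ], [ %scan.next, %ComputeLoop ]
  %lane64 = call i64 @llvm.cttz.i64(i64 %active, i1 true)    ; s_ff1_i32_b64: lowest live lane
  %lane   = trunc i64 %lane64 to i32
  %in     = call i32 @llvm.amdgcn.readlane.i32(i32 %v, i32 %lane)                     ; v_readlane_b32
  ; Hand this lane its exclusive prefix, then fold its input into the total.
  %scan.next  = call i32 @llvm.amdgcn.writelane.i32(i32 %accum, i32 %lane, i32 %scan) ; v_writelane_b32
  %accum.next = add i32 %accum, %in                          ; s_add_i32
  %bit         = shl i64 1, %lane64                          ; s_lshl_b64
  %mask        = xor i64 %bit, -1
  %active.next = and i64 %active, %mask                      ; s_andn2_b64: retire this lane
  %done = icmp eq i64 %active.next, 0
  br i1 %done, label %ComputeEnd, label %ComputeLoop         ; s_cbranch_vccnz
ComputeEnd:
  ; One lane performs the atomic with the wave total; each lane then adds
  ; %scan.next (its exclusive prefix) to the value the atomic returns.
  ret i32 %accum.next
}

The struct_add and sub tests below are the same scan; only the final atomic instruction differs.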
@@ -937,15 +968,46 @@ entry:
define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) {
; GFX6-LABEL: struct_add_i32_varying_vdata:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_load_dword s8, s[2:3], 0x11
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX6-NEXT: s_mov_b64 s[0:1], exec
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: ; implicit-def: $vgpr1
+; GFX6-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1]
+; GFX6-NEXT: s_mov_b32 m0, s5
+; GFX6-NEXT: v_readlane_b32 s8, v0, s5
+; GFX6-NEXT: v_writelane_b32 v1, s4, m0
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_add_i32 s4, s4, s8
+; GFX6-NEXT: s_cbranch_vccnz .LBB3_1
+; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX6-NEXT: s_cbranch_execz .LBB3_4
+; GFX6-NEXT: ; %bb.3:
+; GFX6-NEXT: s_load_dword s5, s[2:3], 0x11
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v1, s8
-; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
+; GFX6-NEXT: .LBB3_4:
+; GFX6-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
@@ -2011,13 +2073,44 @@ entry:
define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: sub_i32_varying_vdata:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX6-NEXT: s_mov_b64 s[0:1], exec
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: ; implicit-def: $vgpr1
+; GFX6-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1]
+; GFX6-NEXT: s_mov_b32 m0, s5
+; GFX6-NEXT: v_readlane_b32 s8, v0, s5
+; GFX6-NEXT: v_writelane_b32 v1, s4, m0
+; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
+; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT: s_add_i32 s4, s4, s8
+; GFX6-NEXT: s_cbranch_vccnz .LBB7_1
+; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: ; implicit-def: $vgpr0
+; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX6-NEXT: s_cbranch_execz .LBB7_4
+; GFX6-NEXT: ; %bb.3:
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX6-NEXT: .LBB7_4:
+; GFX6-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;