From e9a981c202f17aaea84a87e6ba370d746f6f589e Mon Sep 17 00:00:00 2001 From: Vikram Date: Thu, 27 Jun 2024 06:15:54 +0000 Subject: [PATCH 1/2] [AMDGPU] Cleanup bitcast spam in atomic optimizer --- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 107 +- .../global-atomic-fadd.f32-no-rtn.ll | 23 +- .../GlobalISel/global-atomic-fadd.f32-rtn.ll | 39 +- .../atomic_optimizations_global_pointer.ll | 112 +- .../atomic_optimizations_local_pointer.ll | 160 +-- .../AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 33 +- .../AMDGPU/global-atomic-fadd.f32-rtn.ll | 29 +- .../AMDGPU/global_atomic_optimizer_fp_rtn.ll | 1168 +++++++---------- .../global_atomics_iterative_scan_fp.ll | 146 +-- .../global_atomics_optimizer_fp_no_rtn.ll | 474 +++---- .../AMDGPU/global_atomics_scan_fadd.ll | 170 +-- .../AMDGPU/global_atomics_scan_fmax.ll | 114 +- .../AMDGPU/global_atomics_scan_fmin.ll | 114 +- .../AMDGPU/global_atomics_scan_fsub.ll | 170 +-- 14 files changed, 1296 insertions(+), 1563 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 8062bc13f9a93..e3419a73e50fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -386,7 +386,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, Value *V, Value *const Identity) const { Type *AtomicTy = V->getType(); - Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits()); Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); @@ -402,34 +401,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Reduce within each pair of rows (i.e. 32 lanes). assert(ST->hasPermLaneX16()); - V = B.CreateBitCast(V, IntNTy); Value *Permlanex16Call = B.CreateIntrinsic( V->getType(), Intrinsic::amdgcn_permlanex16, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); - V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), - B.CreateBitCast(Permlanex16Call, AtomicTy)); + V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call); if (ST->isWave32()) { return V; } if (ST->hasPermLane64()) { // Reduce across the upper and lower 32 lanes. - V = B.CreateBitCast(V, IntNTy); Value *Permlane64Call = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V); - return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), - B.CreateBitCast(Permlane64Call, AtomicTy)); + return buildNonAtomicBinOp(B, Op, V, Permlane64Call); } // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty()); - V = B.CreateBitCast(V, IntNTy); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy); Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); - return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy), - B.CreateBitCast(Lane32, AtomicTy)); + return buildNonAtomicBinOp(B, Op, Lane0, Lane32); } // Use the builder to create an inclusive scan of V across the wavefront, with @@ -438,8 +431,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, Value *Identity) const { Type *AtomicTy = V->getType(); - Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits()); - Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); @@ -470,20 +461,17 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes // 48..63). assert(ST->hasPermLaneX16()); - V = B.CreateBitCast(V, IntNTy); Value *PermX = B.CreateIntrinsic( V->getType(), Intrinsic::amdgcn_permlanex16, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); - Value *UpdateDPPCall = - B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy), - B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa), - B.getInt32(0xf), B.getFalse()}); - V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall); + Value *UpdateDPPCall = B.CreateCall( + UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}); + V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall); if (!ST->isWave32()) { // Combine lane 31 into lanes 32..63. - V = B.CreateBitCast(V, IntNTy); Value *const Lane31 = B.CreateIntrinsic( V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)}); @@ -491,8 +479,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}); - V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), - UpdateDPPCall); + V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall); } } return V; @@ -503,8 +490,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, Value *Identity) const { Type *AtomicTy = V->getType(); - Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits()); - Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); @@ -514,10 +499,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); } else { - Function *ReadLane = Intrinsic::getDeclaration( - M, Intrinsic::amdgcn_readlane, B.getInt32Ty()); - Function *WriteLane = Intrinsic::getDeclaration( - M, Intrinsic::amdgcn_writelane, B.getInt32Ty()); + Function *ReadLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy); + Function *WriteLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy); // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. @@ -527,24 +512,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); // Copy the old lane 15 to the new lane 16. - V = B.CreateCall( - WriteLane, - {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}), - B.getInt32(16), B.CreateBitCast(V, IntNTy)}); - V = B.CreateBitCast(V, AtomicTy); + V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}), + B.getInt32(16), V}); + if (!ST->isWave32()) { // Copy the old lane 31 to the new lane 32. - V = B.CreateBitCast(V, IntNTy); - V = B.CreateCall(WriteLane, - {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), - B.getInt32(31)}), - B.getInt32(32), V}); + V = B.CreateCall( + WriteLane, + {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V}); // Copy the old lane 47 to the new lane 48. V = B.CreateCall( WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V}); - V = B.CreateBitCast(V, AtomicTy); } } @@ -584,24 +564,18 @@ std::pair AMDGPUAtomicOptimizerImpl::buildScanIteratively( auto *FF1 = B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()}); - Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits()); - auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy); + auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty()); // Get the value required for atomic operation - V = B.CreateBitCast(V, IntNTy); Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane, {V, LaneIdxInt}); - LaneValue = B.CreateBitCast(LaneValue, Ty); // Perform writelane if intermediate scan results are required later in the // kernel computations Value *OldValue = nullptr; if (NeedResult) { - OldValue = - B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane, - {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt, - B.CreateBitCast(OldValuePhi, IntNTy)}); - OldValue = B.CreateBitCast(OldValue, Ty); + OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane, + {Accumulator, LaneIdxInt, OldValuePhi}); OldValuePhi->addIncoming(OldValue, ComputeLoop); } @@ -700,10 +674,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, Type *const Ty = I.getType(); Type *Int32Ty = B.getInt32Ty(); - Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits()); bool isAtomicFloatingPointTy = Ty->isFloatingPointTy(); const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty); - auto *const VecTy = FixedVectorType::get(Int32Ty, 2); // This is the value in the atomic operation we need to combine in order to // reduce the number of atomic operations. @@ -758,13 +730,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, if (ScanImpl == ScanOptions::DPP) { // First we need to set all inactive invocations to the identity value, so // that they can correctly contribute to the final result. - V = B.CreateBitCast(V, IntNTy); - Identity = B.CreateBitCast(Identity, IntNTy); - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy, - {V, Identity}); - NewV = B.CreateBitCast(NewV, Ty); - V = B.CreateBitCast(V, Ty); - Identity = B.CreateBitCast(Identity, Ty); + NewV = + B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); if (!NeedResult && ST->hasPermLaneX16()) { // On GFX10 the permlanex16 instruction helps us build a reduction // without too many readlanes and writelanes, which are generally bad @@ -779,10 +746,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // which we will provide to the atomic operation. Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); assert(TyBitWidth == 32); - NewV = B.CreateBitCast(NewV, IntNTy); - NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane, + NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane, {NewV, LastLaneIdx}); - NewV = B.CreateBitCast(NewV, Ty); } // Finally mark the readlanes in the WWM section. NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); @@ -922,26 +887,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // but have to handle 64-bit broadcasts with two calls to this intrinsic. Value *BroadcastI = nullptr; - if (TyBitWidth == 64) { - Value *CastedPhi = B.CreateBitCast(PHI, IntNTy); - Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty); - CallInst *const ReadFirstLaneLo = B.CreateIntrinsic( - Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo); - CallInst *const ReadFirstLaneHi = B.CreateIntrinsic( - Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi); - Value *const PartialInsert = B.CreateInsertElement( - PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); - Value *const Insert = - B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); - BroadcastI = B.CreateBitCast(Insert, Ty); - } else if (TyBitWidth == 32) { - Value *CastedPhi = B.CreateBitCast(PHI, IntNTy); - BroadcastI = - B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi); - BroadcastI = B.CreateBitCast(BroadcastI, Ty); - + if (TyBitWidth == 32 || TyBitWidth == 64) { + BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI); } else { llvm_unreachable("Unhandled atomic bit width"); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index 21832dc320e42..3f0b86c271538 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -169,30 +169,29 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] ; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 - ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_4]] + ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]] ; GFX90A_GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] ; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec ; GFX90A_GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] @@ -200,7 +199,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_BRANCH %bb.3 ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.35): + ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.31): ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000) ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -211,7 +210,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37): + ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.33): ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index e48d281f37c9a..676eae1bad85d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -171,35 +171,34 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec - ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec - ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 - ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY8]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY9]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_3]], 0, [[S_MOV_B32_3]], [[V_ADD_F32_e64_3]], 0, implicit $exec - ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec - ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 15 - ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_4]] - ; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READLANE_B32_]], [[S_MOV_B32_5]], [[V_MOV_B32_dpp5]] - ; GFX11-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 31 - ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_6]] + ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 15 + ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_3]] + ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READLANE_B32_]], [[S_MOV_B32_4]], [[V_MOV_B32_dpp5]] + ; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 31 + ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_5]] ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_1]] ; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY13]], implicit $exec ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -207,7 +206,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.3 (%ir-block.36): + ; GFX11-NEXT: bb.3 (%ir-block.29): ; GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -217,11 +216,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: bb.4.Flow: ; GFX11-NEXT: successors: %bb.6(0x80000000) ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %41, %bb.5, [[DEF]], %bb.1 + ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %40, %bb.5, [[DEF]], %bb.1 ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.6 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.5 (%ir-block.39): + ; GFX11-NEXT: bb.5 (%ir-block.32): ; GFX11-NEXT: successors: %bb.4(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 @@ -232,7 +231,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.4 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.6 (%ir-block.47): + ; GFX11-NEXT: bb.6 (%ir-block.37): ; GFX11-NEXT: $vgpr0 = COPY [[PHI]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 8ee0ee3b27bae..d3944d3d52d77 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1058,13 +1058,13 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm @@ -1095,10 +1095,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX89-NEXT: .LBB3_2: ; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_readfirstlane_b32 s2, v0 -; GFX89-NEXT: v_readfirstlane_b32 s3, v1 -; GFX89-NEXT: v_mov_b32_e32 v0, s2 -; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_readfirstlane_b32 s2, v1 +; GFX89-NEXT: v_readfirstlane_b32 s3, v0 +; GFX89-NEXT: v_mov_b32_e32 v0, s3 +; GFX89-NEXT: v_mov_b32_e32 v1, s2 ; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX89-NEXT: s_mov_b32 s3, 0xf000 ; GFX89-NEXT: s_mov_b32 s2, -1 @@ -1134,8 +1134,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 @@ -1169,8 +1169,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 @@ -1205,8 +1205,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: .LBB3_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -1242,8 +1242,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: .LBB3_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -1281,8 +1281,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: .LBB3_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3] ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 @@ -1318,8 +1318,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: .LBB3_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3] ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 @@ -1367,15 +1367,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 ; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 ; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 ; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s2 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s3, v2 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -1407,10 +1407,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_readfirstlane_b32 s3, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1] @@ -1449,10 +1449,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -1493,8 +1493,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -1534,8 +1534,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 @@ -1576,8 +1576,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s6, -1 @@ -1622,8 +1622,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s6, -1 @@ -1666,8 +1666,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB4_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 @@ -1706,8 +1706,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB4_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s6, -1 @@ -2925,13 +2925,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm @@ -2961,12 +2961,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -2999,12 +2999,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3291,15 +3291,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 ; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 ; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 ; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s2 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s3, v2 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -3334,11 +3334,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s1, v2 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc @@ -3379,11 +3379,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index af6f69130910d..b0b40aa952a9f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -944,12 +944,12 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -974,11 +974,11 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -1005,11 +1005,11 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -1039,8 +1039,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 @@ -1068,8 +1068,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 @@ -1099,8 +1099,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: .LBB4_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -1132,8 +1132,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: .LBB4_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -1182,14 +1182,14 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s4, s0 ; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v0 ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 ; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 ; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 ; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v2 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -1217,10 +1217,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: v_readfirstlane_b32 s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] @@ -1256,10 +1256,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -1296,8 +1296,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1064-NEXT: .LBB5_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] ; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] @@ -1331,8 +1331,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1032-NEXT: .LBB5_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] ; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] @@ -1367,8 +1367,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB5_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] @@ -1407,8 +1407,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB5_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] @@ -2444,12 +2444,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2475,12 +2475,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -2507,12 +2507,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc @@ -2696,14 +2696,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s4, s0 ; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v0 ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 ; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 ; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 ; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v2 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -2736,11 +2736,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s1, v2 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc @@ -2776,11 +2776,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc @@ -4149,8 +4149,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -4249,8 +4249,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] @@ -4280,8 +4280,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo ; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] @@ -4312,8 +4312,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: .LBB18_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4345,8 +4345,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: .LBB18_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4663,8 +4663,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -4763,8 +4763,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] @@ -4794,8 +4794,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo ; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] @@ -4826,8 +4826,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: .LBB20_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4859,8 +4859,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: .LBB20_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5177,8 +5177,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 @@ -5208,8 +5208,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] @@ -5240,8 +5240,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] @@ -5274,8 +5274,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] @@ -5305,8 +5305,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo ; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] @@ -5337,8 +5337,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: .LBB22_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5370,8 +5370,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: .LBB22_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5688,8 +5688,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 @@ -5719,8 +5719,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB24_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] @@ -5751,8 +5751,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB24_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] @@ -5785,8 +5785,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] @@ -5816,8 +5816,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo ; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] @@ -5848,8 +5848,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: .LBB24_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5881,8 +5881,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: .LBB24_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index ee0910b21f024..60c3328b08c6c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -170,9 +170,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX908-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec ; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 - ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec ; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec ; GFX908-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec @@ -185,14 +184,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec ; GFX908-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec - ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]] + ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX908-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; GFX908-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX908-NEXT: S_BRANCH %bb.2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: bb.2 (%ir-block.35): + ; GFX908-NEXT: bb.2 (%ir-block.31): ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -204,7 +203,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: bb.4 (%ir-block.37): + ; GFX908-NEXT: bb.4 (%ir-block.33): ; GFX908-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX908-NEXT: S_ENDPGM 0 ; @@ -232,9 +231,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec ; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 - ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec @@ -247,14 +245,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]] + ; GFX90A_GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX90A_GFX940-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_BRANCH %bb.2 ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.35): + ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.31): ; GFX90A_GFX940-NEXT: successors: %bb.3(0x80000000) ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -266,7 +264,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.37): + ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.33): ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_ENDPGM 0 ; @@ -290,9 +288,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11_GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $exec_lo ; GFX11_GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX11_GFX12-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec - ; GFX11_GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 - ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec ; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 354, 15, 15, 0, implicit $exec @@ -301,15 +298,15 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX11_GFX12-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11_GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[V_ADD_F32_e64_3]], 0, implicit $exec ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec ; GFX11_GFX12-NEXT: early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec ; GFX11_GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; GFX11_GFX12-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11_GFX12-NEXT: S_BRANCH %bb.2 ; GFX11_GFX12-NEXT: {{ $}} - ; GFX11_GFX12-NEXT: bb.2 (%ir-block.28): + ; GFX11_GFX12-NEXT: bb.2 (%ir-block.24): ; GFX11_GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX11_GFX12-NEXT: {{ $}} ; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -320,7 +317,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11_GFX12-NEXT: {{ $}} ; GFX11_GFX12-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11_GFX12-NEXT: {{ $}} - ; GFX11_GFX12-NEXT: bb.4 (%ir-block.30): + ; GFX11_GFX12-NEXT: bb.4 (%ir-block.26): ; GFX11_GFX12-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11_GFX12-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 3454e9d1019e5..8eb38344fc944 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -176,9 +176,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $exec_lo ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec - ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 - ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec @@ -187,24 +186,24 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[V_ADD_F32_e64_3]], 0, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec - ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 15 - ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_3]] - ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed [[V_READLANE_B32_]], killed [[S_MOV_B32_4]], [[V_MOV_B32_dpp5]] - ; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 31 - ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_5]] + ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 15 + ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_2]] + ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed [[V_READLANE_B32_]], killed [[S_MOV_B32_3]], [[V_MOV_B32_dpp5]] + ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 31 + ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_4]] ; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.2 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.2 (%ir-block.36): + ; GFX11-NEXT: bb.2 (%ir-block.29): ; GFX11-NEXT: successors: %bb.4(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -219,17 +218,17 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.5 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.4 (%ir-block.39): + ; GFX11-NEXT: bb.4 (%ir-block.32): ; GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX11-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec - ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: early-clobber %43:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %43, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.5 (%ir-block.47): + ; GFX11-NEXT: bb.5 (%ir-block.37): ; GFX11-NEXT: $vgpr0 = COPY [[PHI]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll index 722c069f90a8c..2397d6c4e8938 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll @@ -9,7 +9,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { ; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( ; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR: 2: ; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -28,16 +28,14 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_uns ; IR-NEXT: br label [[TMP16]] ; IR: 16: ; IR-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) -; IR-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float -; IR-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float -; IR-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] -; IR-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] -; IR-NEXT: br label [[TMP24]] -; IR: 24: -; IR-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] -; IR-NEXT: ret float [[TMP25]] +; IR-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) +; IR-NEXT: [[TMP19:%.*]] = uitofp i32 [[TMP8]] to float +; IR-NEXT: [[TMP20:%.*]] = fmul float [[VAL]], [[TMP19]] +; IR-NEXT: [[TMP21:%.*]] = fadd float [[TMP18]], [[TMP20]] +; IR-NEXT: br label [[TMP22]] +; IR: 22: +; IR-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-NEXT: ret float [[TMP23]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret float %result @@ -46,7 +44,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_uns define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -57,44 +55,37 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] syncscope("agent") monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP28:%.*]] -; IR-ITERATIVE-NEXT: br label [[TMP18]] -; IR-ITERATIVE: 18: -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = fadd float [[TMP14]], [[TMP21:%.*]] +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP17]] ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 -; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) -; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float -; IR-ITERATIVE-NEXT: [[TMP29]] = fadd float [[ACCUMULATOR]], [[TMP24]] -; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] -; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1 -; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]] -; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]]) +; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]]) +; IR-ITERATIVE-NEXT: [[TMP22]] = fadd float [[ACCUMULATOR]], [[TMP20]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -102,43 +93,36 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP14:%.*]] = fadd float [[TMP11]], [[TMP13]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP16:%.*]] = fadd float [[TMP14]], [[TMP15]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP18:%.*]] = fadd float [[TMP16]], [[TMP17]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP18]], [[TMP19]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]] -; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) -; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float -; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) -; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]] -; IR-DPP: 31: -; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = fadd float [[TMP9]], [[TMP10]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = fadd float [[TMP11]], [[TMP12]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP13]], [[TMP14]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = fadd float [[TMP17]], [[TMP18]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = fadd float [[TMP19]], [[TMP20]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP21]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]]) +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) +; IR-DPP-NEXT: [[TMP32:%.*]] = fadd float [[TMP30]], [[TMP31]] ; IR-DPP-NEXT: br label [[TMP33]] ; IR-DPP: 33: -; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] -; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) -; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float -; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) -; IR-DPP-NEXT: [[TMP39:%.*]] = fadd float [[TMP37]], [[TMP38]] -; IR-DPP-NEXT: br label [[TMP40]] -; IR-DPP: 40: -; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ] -; IR-DPP-NEXT: ret float [[TMP41]] +; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret float [[TMP34]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret float %result @@ -147,7 +131,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7:[0-9]+]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -166,20 +150,18 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un ; IR-ITERATIVE-NEXT: br label [[TMP16]] ; IR-ITERATIVE: 16: ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP24]] -; IR-ITERATIVE: 24: -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP22]] +; IR-ITERATIVE: 22: +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP23]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8:[0-9]+]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -198,16 +180,14 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un ; IR-DPP-NEXT: br label [[TMP16]] ; IR-DPP: 16: ; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP24]] -; IR-DPP: 24: -; IR-DPP-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] -; IR-DPP-NEXT: ret float [[TMP25]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP22]] +; IR-DPP: 22: +; IR-DPP-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-DPP-NEXT: ret float [[TMP23]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret float %result @@ -216,7 +196,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float %val) #1 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -227,44 +207,37 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] syncscope("one-as") monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP18]] -; IR-ITERATIVE: 18: -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP17]] ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 -; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float -; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] -; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1 -; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]] -; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -272,43 +245,36 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float -; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]] -; IR-DPP: 31: -; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP30]], float [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: br label [[TMP33]] ; IR-DPP: 33: -; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] -; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float -; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP40]] -; IR-DPP: 40: -; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ] -; IR-DPP-NEXT: ret float [[TMP41]] +; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret float [[TMP34]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret float %result @@ -317,7 +283,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -336,20 +302,18 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str ; IR-ITERATIVE-NEXT: br label [[TMP16]] ; IR-ITERATIVE: 16: ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP24]] -; IR-ITERATIVE: 24: -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP22]] +; IR-ITERATIVE: 22: +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP23]] ; ; IR-DPP-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -368,16 +332,14 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str ; IR-DPP-NEXT: br label [[TMP16]] ; IR-DPP: 16: ; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP24]] -; IR-DPP: 24: -; IR-DPP-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] -; IR-DPP-NEXT: ret float [[TMP25]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP22]] +; IR-DPP: 22: +; IR-DPP-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-DPP-NEXT: ret float [[TMP23]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result @@ -386,7 +348,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -397,44 +359,37 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] syncscope("agent") monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP18]] -; IR-ITERATIVE: 18: -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP14]], float [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP17]] ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 -; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float -; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] -; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1 -; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]] -; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -442,43 +397,36 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float -; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]] -; IR-DPP: 31: -; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP30]], float [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: br label [[TMP33]] ; IR-DPP: 33: -; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] -; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float -; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP40]] -; IR-DPP: 40: -; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ] -; IR-DPP-NEXT: ret float [[TMP41]] +; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret float [[TMP34]] ; %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result @@ -487,7 +435,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { ; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( ; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] ; IR: 2: ; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -502,16 +450,14 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_uns ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) -; IR-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float -; IR-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]] -; IR-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]]) -; IR-NEXT: br label [[TMP20]] -; IR: 20: -; IR-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ] -; IR-NEXT: ret float [[TMP21]] +; IR-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) +; IR-NEXT: [[TMP15:%.*]] = uitofp i32 [[TMP8]] to float +; IR-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]] +; IR-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP14]], float [[TMP16]]) +; IR-NEXT: br label [[TMP18]] +; IR: 18: +; IR-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] +; IR-NEXT: ret float [[TMP19]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result @@ -520,7 +466,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_uns define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 { ; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -531,44 +477,37 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] syncscope("agent") monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP28:%.*]]) -; IR-ITERATIVE-NEXT: br label [[TMP18]] -; IR-ITERATIVE: 18: -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.minnum.f32(float [[TMP14]], float [[TMP21:%.*]]) +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP17]] ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 -; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) -; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float -; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP24]]) -; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] -; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1 -; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]] -; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]]) +; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]]) +; IR-ITERATIVE-NEXT: [[TMP22]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP20]]) +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -576,43 +515,36 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 2139095040) -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.minnum.f32(float [[TMP11]], float [[TMP13]]) -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.minnum.f32(float [[TMP14]], float [[TMP15]]) -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP17]]) -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.minnum.f32(float [[TMP18]], float [[TMP19]]) -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.minnum.f32(float [[TMP20]], float [[TMP21]]) -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]]) -; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) -; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float -; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) -; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]] -; IR-DPP: 31: -; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float 0x7FF0000000000000) +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.minnum.f32(float [[TMP9]], float [[TMP10]]) +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.minnum.f32(float [[TMP11]], float [[TMP12]]) +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.minnum.f32(float [[TMP13]], float [[TMP14]]) +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP15]], float [[TMP16]]) +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP17]], float [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.minnum.f32(float [[TMP19]], float [[TMP20]]) +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP21]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]]) +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) +; IR-DPP-NEXT: [[TMP32:%.*]] = call float @llvm.minnum.f32(float [[TMP30]], float [[TMP31]]) ; IR-DPP-NEXT: br label [[TMP33]] ; IR-DPP: 33: -; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] -; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) -; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float -; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) -; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.minnum.f32(float [[TMP37]], float [[TMP38]]) -; IR-DPP-NEXT: br label [[TMP40]] -; IR-DPP: 40: -; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ] -; IR-DPP-NEXT: ret float [[TMP41]] +; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret float [[TMP34]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result @@ -621,7 +553,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1{ ; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -636,20 +568,18 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns ; IR-ITERATIVE-NEXT: br label [[TMP12]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]] -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP20]] -; IR-ITERATIVE: 20: -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP21]] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP14]], float [[TMP16]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP18]] +; IR-ITERATIVE: 18: +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP19]] ; ; IR-DPP-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -664,16 +594,14 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns ; IR-DPP-NEXT: br label [[TMP12]] ; IR-DPP: 12: ; IR-DPP-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] -; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-DPP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP20]] -; IR-DPP: 20: -; IR-DPP-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ] -; IR-DPP-NEXT: ret float [[TMP21]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP14]], float [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP18]] +; IR-DPP: 18: +; IR-DPP-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] +; IR-DPP-NEXT: ret float [[TMP19]] ; %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result @@ -682,7 +610,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float %val) #1{ ; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -693,44 +621,37 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] syncscope("agent") monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP18]] -; IR-ITERATIVE: 18: -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP14]], float [[TMP21:%.*]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP17]] ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 -; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float -; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] -; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1 -; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]] -; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP20]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -738,43 +659,36 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -8388608) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP11]], float [[TMP13]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP14]], float [[TMP15]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP17]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP18]], float [[TMP19]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP20]], float [[TMP21]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float -; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]] -; IR-DPP: 31: -; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float 0xFFF0000000000000) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP9]], float [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP11]], float [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP13]], float [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP15]], float [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP17]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP19]], float [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP30]], float [[TMP31]], metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: br label [[TMP33]] ; IR-DPP: 33: -; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] -; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float -; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP37]], float [[TMP38]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP40]] -; IR-DPP: 40: -; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ] -; IR-DPP-NEXT: ret float [[TMP41]] +; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret float [[TMP34]] ; %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result @@ -783,7 +697,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -802,20 +716,18 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st ; IR-ITERATIVE-NEXT: br label [[TMP16]] ; IR-ITERATIVE: 16: ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP24]] -; IR-ITERATIVE: 24: -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP22]] +; IR-ITERATIVE: 22: +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP23]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -834,16 +746,14 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st ; IR-DPP-NEXT: br label [[TMP16]] ; IR-DPP: 16: ; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP24]] -; IR-DPP: 24: -; IR-DPP-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] -; IR-DPP-NEXT: ret float [[TMP25]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP22]] +; IR-DPP: 22: +; IR-DPP-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-DPP-NEXT: ret float [[TMP23]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 ret float %result @@ -852,7 +762,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -863,44 +773,37 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP18]] -; IR-ITERATIVE: 18: -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP17]] ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 -; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float -; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] -; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1 -; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]] -; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1 +; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -908,43 +811,36 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float -; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]] -; IR-DPP: 31: -; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] monotonic, align 4 +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP30]], float [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: br label [[TMP33]] ; IR-DPP: 33: -; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] -; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float -; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP40]] -; IR-DPP: 40: -; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ] -; IR-DPP-NEXT: ret float [[TMP41]] +; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ] +; IR-DPP-NEXT: ret float [[TMP34]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 ret float %result @@ -1061,7 +957,7 @@ define amdgpu_ps float @global_atomic_fadd_div_address_div_value_system_scope_st define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 { ; IR-LABEL: @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe( ; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR: 2: ; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -1080,22 +976,14 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_s ; IR-NEXT: br label [[TMP16]] ; IR: 16: ; IR-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 -; IR-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 -; IR-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 -; IR-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) -; IR-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) -; IR-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 -; IR-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 -; IR-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double -; IR-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP8]] to double -; IR-NEXT: [[TMP28:%.*]] = fmul double [[VAL]], [[TMP27]] -; IR-NEXT: [[TMP29:%.*]] = fadd double [[TMP26]], [[TMP28]] -; IR-NEXT: br label [[TMP30]] -; IR: 30: -; IR-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] -; IR-NEXT: ret double [[TMP31]] +; IR-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) +; IR-NEXT: [[TMP19:%.*]] = uitofp i32 [[TMP8]] to double +; IR-NEXT: [[TMP20:%.*]] = fmul double [[VAL]], [[TMP19]] +; IR-NEXT: [[TMP21:%.*]] = fadd double [[TMP18]], [[TMP20]] +; IR-NEXT: br label [[TMP22]] +; IR: 22: +; IR-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-NEXT: ret double [[TMP23]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 ret double %result @@ -1113,7 +1001,7 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_scope_a define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -1132,26 +1020,18 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_ ; IR-ITERATIVE-NEXT: br label [[TMP16]] ; IR-ITERATIVE: 16: ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 -; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP30]] -; IR-ITERATIVE: 30: -; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] -; IR-ITERATIVE-NEXT: ret double [[TMP31]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP22]] +; IR-ITERATIVE: 22: +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP23]] ; ; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -1170,22 +1050,14 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_ ; IR-DPP-NEXT: br label [[TMP16]] ; IR-DPP: 16: ; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 -; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 -; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 -; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 -; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 -; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double -; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP30]] -; IR-DPP: 30: -; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] -; IR-DPP-NEXT: ret double [[TMP31]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP22]] +; IR-DPP: 22: +; IR-DPP-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-DPP-NEXT: ret double [[TMP23]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -1203,7 +1075,7 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -1222,26 +1094,18 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s ; IR-ITERATIVE-NEXT: br label [[TMP16]] ; IR-ITERATIVE: 16: ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 -; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP30]] -; IR-ITERATIVE: 30: -; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] -; IR-ITERATIVE-NEXT: ret double [[TMP31]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP22]] +; IR-ITERATIVE: 22: +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP23]] ; ; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -1260,22 +1124,14 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s ; IR-DPP-NEXT: br label [[TMP16]] ; IR-DPP: 16: ; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 -; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 -; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 -; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 -; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 -; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double -; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP30]] -; IR-DPP: 30: -; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] -; IR-DPP-NEXT: ret double [[TMP31]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP22]] +; IR-DPP: 22: +; IR-DPP-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-DPP-NEXT: ret double [[TMP23]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1293,7 +1149,7 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_div_value_agent_s define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 { ; IR-LABEL: @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe( ; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]] +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] ; IR: 2: ; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -1308,22 +1164,14 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_s ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64 -; IR-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 -; IR-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 -; IR-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 -; IR-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) -; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) -; IR-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 -; IR-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1 -; IR-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double -; IR-NEXT: [[TMP23:%.*]] = uitofp i32 [[TMP8]] to double -; IR-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0x7FF0000000000000, double [[VAL]] -; IR-NEXT: [[TMP25:%.*]] = call double @llvm.minnum.f64(double [[TMP22]], double [[TMP24]]) -; IR-NEXT: br label [[TMP26]] -; IR: 26: -; IR-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ] -; IR-NEXT: ret double [[TMP27]] +; IR-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) +; IR-NEXT: [[TMP15:%.*]] = uitofp i32 [[TMP8]] to double +; IR-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], double 0x7FF0000000000000, double [[VAL]] +; IR-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP14]], double [[TMP16]]) +; IR-NEXT: br label [[TMP18]] +; IR: 18: +; IR-NEXT: [[TMP19:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] +; IR-NEXT: ret double [[TMP19]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1341,7 +1189,7 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_div_value_agent_s define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{ ; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -1356,26 +1204,18 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_ ; IR-ITERATIVE-NEXT: br label [[TMP12]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]] -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP22]], double [[TMP24]], metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP26]] -; IR-ITERATIVE: 26: -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ] -; IR-ITERATIVE-NEXT: ret double [[TMP27]] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP14]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP18]] +; IR-ITERATIVE: 18: +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP19]] ; ; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -1390,22 +1230,14 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_ ; IR-DPP-NEXT: br label [[TMP12]] ; IR-DPP: 12: ; IR-DPP-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] -; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64 -; IR-DPP-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 -; IR-DPP-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 -; IR-DPP-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 -; IR-DPP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 -; IR-DPP-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1 -; IR-DPP-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double -; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]] -; IR-DPP-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP22]], double [[TMP24]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP26]] -; IR-DPP: 26: -; IR-DPP-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ] -; IR-DPP-NEXT: ret double [[TMP27]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP14]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP18]] +; IR-DPP: 18: +; IR-DPP-NEXT: [[TMP19:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] +; IR-DPP-NEXT: ret double [[TMP19]] ; %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1423,7 +1255,7 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-ITERATIVE: 2: ; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -1442,26 +1274,18 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_ ; IR-ITERATIVE-NEXT: br label [[TMP16]] ; IR-ITERATIVE: 16: ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 -; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: br label [[TMP30]] -; IR-ITERATIVE: 30: -; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] -; IR-ITERATIVE-NEXT: ret double [[TMP31]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP22]] +; IR-ITERATIVE: 22: +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP23]] ; ; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -1480,22 +1304,14 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_ ; IR-DPP-NEXT: br label [[TMP16]] ; IR-DPP: 16: ; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 -; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 -; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 -; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 -; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 -; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double -; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: br label [[TMP30]] -; IR-DPP: 30: -; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] -; IR-DPP-NEXT: ret double [[TMP31]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP22]] +; IR-DPP: 22: +; IR-DPP-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ] +; IR-DPP-NEXT: ret double [[TMP23]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 ret double %result diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll index 38823681d1bb5..d1e50bd560cb2 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll @@ -59,27 +59,25 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) # ; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 8: -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP14:%.*]] seq_cst, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] ; IR-ITERATIVE: 10: ; IR-ITERATIVE-NEXT: ret void ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP14]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP17:%.*]], [[COMPUTELOOP]] ] ; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]]) -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float -; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]] -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 -; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[DIVVALUE]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP14]] = fadd float [[ACCUMULATOR]], [[TMP13]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], -1 +; IR-ITERATIVE-NEXT: [[TMP17]] = and i64 [[ACTIVEBITS]], [[TMP16]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP18]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP19]], label [[TMP8:%.*]], label [[TMP10]] ; ; IR-DPP-LABEL: @global_atomic_fadd_div_value( ; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() @@ -90,32 +88,27 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) # ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 ; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) ; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) -; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 -2147483648) -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float -; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float -; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP12:%.*]] = fadd float [[TMP9]], [[TMP11]] -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP12]], i32 274, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP14:%.*]] = fadd float [[TMP12]], [[TMP13]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 276, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP16:%.*]] = fadd float [[TMP14]], [[TMP15]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 280, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP18:%.*]] = fadd float [[TMP16]], [[TMP17]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 322, i32 10, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP18]], [[TMP19]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]] -; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 -; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 63) -; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float -; IR-DPP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) -; IR-DPP-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0 -; IR-DPP-NEXT: br i1 [[TMP27]], label [[TMP28:%.*]], label [[TMP30:%.*]] -; IR-DPP: 28: -; IR-DPP-NEXT: [[TMP29:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP26]] seq_cst, align 4 -; IR-DPP-NEXT: br label [[TMP30]] -; IR-DPP: 30: +; IR-DPP-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[DIVVALUE]], float -0.000000e+00) +; IR-DPP-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP7]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = fadd float [[TMP9]], [[TMP10]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = fadd float [[TMP11]], [[TMP12]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP13]], [[TMP14]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = fadd float [[TMP17]], [[TMP18]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP19]], i32 63) +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP20]]) +; IR-DPP-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP22]], label [[TMP23:%.*]], label [[TMP25:%.*]] +; IR-DPP: 23: +; IR-DPP-NEXT: [[TMP24:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP21]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP25]] +; IR-DPP: 25: ; IR-DPP-NEXT: ret void ; %id.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -181,27 +174,25 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) # ; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 8: -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP14:%.*]] seq_cst, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] ; IR-ITERATIVE: 10: ; IR-ITERATIVE-NEXT: ret void ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP14]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP17:%.*]], [[COMPUTELOOP]] ] ; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]]) -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float -; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]] -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 -; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[DIVVALUE]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP14]] = fadd float [[ACCUMULATOR]], [[TMP13]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], -1 +; IR-ITERATIVE-NEXT: [[TMP17]] = and i64 [[ACTIVEBITS]], [[TMP16]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP18]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP19]], label [[TMP8:%.*]], label [[TMP10]] ; ; IR-DPP-LABEL: @global_atomic_fsub_div_value( ; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() @@ -212,32 +203,27 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) # ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 ; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) ; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) -; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 -2147483648) -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float -; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float -; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP12:%.*]] = fadd float [[TMP9]], [[TMP11]] -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP12]], i32 274, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP14:%.*]] = fadd float [[TMP12]], [[TMP13]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 276, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP16:%.*]] = fadd float [[TMP14]], [[TMP15]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 280, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP18:%.*]] = fadd float [[TMP16]], [[TMP17]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 322, i32 10, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP18]], [[TMP19]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]] -; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 -; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 63) -; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float -; IR-DPP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) -; IR-DPP-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0 -; IR-DPP-NEXT: br i1 [[TMP27]], label [[TMP28:%.*]], label [[TMP30:%.*]] -; IR-DPP: 28: -; IR-DPP-NEXT: [[TMP29:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP26]] seq_cst, align 4 -; IR-DPP-NEXT: br label [[TMP30]] -; IR-DPP: 30: +; IR-DPP-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[DIVVALUE]], float -0.000000e+00) +; IR-DPP-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP7]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = fadd float [[TMP9]], [[TMP10]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = fadd float [[TMP11]], [[TMP12]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP13]], [[TMP14]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = fadd float [[TMP17]], [[TMP18]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP19]], i32 63) +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP20]]) +; IR-DPP-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP22]], label [[TMP23:%.*]], label [[TMP25:%.*]] +; IR-DPP: 23: +; IR-DPP-NEXT: [[TMP24:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP21]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP25]] +; IR-DPP: 25: ; IR-DPP-NEXT: ret void ; %id.x = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll index fa66a0fdc76ce..d6edba001fb13 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll @@ -49,33 +49,31 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] syncscope("agent") monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: br label [[TMP13]] ; IR-ITERATIVE: 13: ; IR-ITERATIVE-NEXT: ret void ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float -; IR-ITERATIVE-NEXT: [[TMP19]] = fadd float [[ACCUMULATOR]], [[TMP18]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1 -; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17]] = fadd float [[ACCUMULATOR]], [[TMP16]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -83,34 +81,29 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP14:%.*]] = fadd float [[TMP11]], [[TMP13]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP16:%.*]] = fadd float [[TMP14]], [[TMP15]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP18:%.*]] = fadd float [[TMP16]], [[TMP17]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP18]], [[TMP19]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]] -; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) -; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float -; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) -; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]] -; IR-DPP: 30: -; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP32]] -; IR-DPP: 32: -; IR-DPP-NEXT: br label [[TMP33]] -; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = fadd float [[TMP9]], [[TMP10]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = fadd float [[TMP11]], [[TMP12]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP13]], [[TMP14]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = fadd float [[TMP17]], [[TMP18]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = fadd float [[TMP19]], [[TMP20]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 @@ -184,33 +177,31 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] syncscope("one-as") monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: br label [[TMP13]] ; IR-ITERATIVE: 13: ; IR-ITERATIVE-NEXT: ret void ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float -; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1 -; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -218,34 +209,29 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float -; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]] -; IR-DPP: 30: -; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] syncscope("one-as") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP32]] -; IR-DPP: 32: -; IR-DPP-NEXT: br label [[TMP33]] -; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic @@ -319,33 +305,31 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] syncscope("agent") monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: br label [[TMP13]] ; IR-ITERATIVE: 13: ; IR-ITERATIVE-NEXT: ret void ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float -; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1 -; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -353,34 +337,29 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float -; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]] -; IR-DPP: 30: -; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP32]] -; IR-DPP: 32: -; IR-DPP-NEXT: br label [[TMP33]] -; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic @@ -426,33 +405,31 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] syncscope("agent") monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: br label [[TMP13]] ; IR-ITERATIVE: 13: ; IR-ITERATIVE-NEXT: ret void ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float -; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP18]]) -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1 -; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP16]]) +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -460,34 +437,29 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 2139095040) -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.minnum.f32(float [[TMP11]], float [[TMP13]]) -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.minnum.f32(float [[TMP14]], float [[TMP15]]) -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP17]]) -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.minnum.f32(float [[TMP18]], float [[TMP19]]) -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.minnum.f32(float [[TMP20]], float [[TMP21]]) -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]]) -; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) -; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float -; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) -; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]] -; IR-DPP: 30: -; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP32]] -; IR-DPP: 32: -; IR-DPP-NEXT: br label [[TMP33]] -; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float 0x7FF0000000000000) +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.minnum.f32(float [[TMP9]], float [[TMP10]]) +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.minnum.f32(float [[TMP11]], float [[TMP12]]) +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.minnum.f32(float [[TMP13]], float [[TMP14]]) +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP15]], float [[TMP16]]) +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP17]], float [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.minnum.f32(float [[TMP19]], float [[TMP20]]) +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic @@ -553,33 +525,31 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] syncscope("agent") monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: br label [[TMP13]] ; IR-ITERATIVE: 13: ; IR-ITERATIVE-NEXT: ret void ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float -; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1 -; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP16]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -587,34 +557,29 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -8388608) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP11]], float [[TMP13]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP14]], float [[TMP15]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP17]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP18]], float [[TMP19]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP20]], float [[TMP21]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float -; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]] -; IR-DPP: 30: -; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP32]] -; IR-DPP: 32: -; IR-DPP-NEXT: br label [[TMP33]] -; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float 0xFFF0000000000000) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP9]], float [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP11]], float [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP13]], float [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP15]], float [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP17]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP19]], float [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic @@ -688,33 +653,31 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str ; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] ; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] monotonic, align 4 ; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: br label [[TMP13]] ; IR-ITERATIVE: 13: ; IR-ITERATIVE-NEXT: ret void ; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float -; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1 -; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] ; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]] +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp( ; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] ; IR-DPP: 2: ; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -722,34 +685,29 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str ; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float -; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] -; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float -; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]] -; IR-DPP: 30: -; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP32]] -; IR-DPP: 32: -; IR-DPP-NEXT: br label [[TMP33]] -; IR-DPP: 33: +; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 6555ceb3ed338..04df04a5c299b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -706,35 +706,37 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -1910,35 +1912,37 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -3114,35 +3118,37 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -3834,35 +3840,37 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -5037,35 +5045,37 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 7f052e1ae8b55..9f27314cc3909 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -607,42 +607,44 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -1726,42 +1728,44 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -2905,42 +2909,44 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index a9f49ad1508d5..f16f61159fc30 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -607,42 +607,44 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -1726,42 +1728,44 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -2905,42 +2909,44 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 5cb57703c01d9..64650e2733a00 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -784,35 +784,37 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -2014,35 +2016,37 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -3244,35 +3248,37 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -4016,35 +4022,37 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -5245,35 +5253,37 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_not_b64 exec, exec ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 From 5423d96742b991fdbb32361251a274fc52141849 Mon Sep 17 00:00:00 2001 From: Vikram Date: Mon, 1 Jul 2024 08:28:23 -0400 Subject: [PATCH 2/2] review comment --- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index e3419a73e50fb..f9c978c5a0309 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -886,12 +886,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // lane) to all other lanes in the wavefront. We use an intrinsic for this, // but have to handle 64-bit broadcasts with two calls to this intrinsic. Value *BroadcastI = nullptr; - - if (TyBitWidth == 32 || TyBitWidth == 64) { - BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI); - } else { - llvm_unreachable("Unhandled atomic bit width"); - } + BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI); // Now that we have the result of our single atomic operation, we need to // get our individual lane's slice into the result. We use the lane offset